From 68569dee1416593955c1570d638b3d9250b33012 Mon Sep 17 00:00:00 2001 From: trav90 Date: Mon, 15 Oct 2018 21:45:30 -0500 Subject: Import aom library This is the reference implementation for the Alliance for Open Media's av1 video code. The commit used was 4d668d7feb1f8abd809d1bca0418570a7f142a36. --- third_party/aom/aom_dsp/add_noise.c | 73 + third_party/aom/aom_dsp/ans.h | 44 + third_party/aom/aom_dsp/ansreader.h | 214 ++ third_party/aom/aom_dsp/answriter.h | 148 + third_party/aom/aom_dsp/aom_convolve.c | 854 +++++ third_party/aom/aom_dsp/aom_convolve.h | 57 + third_party/aom/aom_dsp/aom_dsp.cmake | 509 +++ third_party/aom/aom_dsp/aom_dsp.mk | 428 +++ third_party/aom/aom_dsp/aom_dsp_common.h | 107 + third_party/aom/aom_dsp/aom_dsp_rtcd.c | 16 + third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 1495 ++++++++ third_party/aom/aom_dsp/aom_filter.h | 43 + third_party/aom/aom_dsp/aom_simd.h | 37 + third_party/aom/aom_dsp/aom_simd_inline.h | 21 + .../aom/aom_dsp/arm/aom_convolve8_avg_neon.c | 364 ++ .../aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm | 295 ++ third_party/aom/aom_dsp/arm/aom_convolve8_neon.c | 331 ++ .../aom/aom_dsp/arm/aom_convolve8_neon_asm.asm | 273 ++ .../aom/aom_dsp/arm/aom_convolve_avg_neon.c | 145 + .../aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm | 119 + .../aom/aom_dsp/arm/aom_convolve_copy_neon.c | 93 + .../aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm | 87 + third_party/aom/aom_dsp/arm/aom_convolve_neon.c | 66 + third_party/aom/aom_dsp/arm/avg_neon.c | 254 ++ .../aom/aom_dsp/arm/bilinear_filter_media.asm | 240 ++ third_party/aom/aom_dsp/arm/fwd_txfm_neon.c | 221 ++ third_party/aom/aom_dsp/arm/hadamard_neon.c | 200 ++ .../aom/aom_dsp/arm/idct16x16_1_add_neon.asm | 201 ++ third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c | 59 + third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm | 1182 +++++++ third_party/aom/aom_dsp/arm/idct16x16_add_neon.c | 1295 +++++++ third_party/aom/aom_dsp/arm/idct16x16_neon.c | 152 + .../aom/aom_dsp/arm/idct32x32_1_add_neon.asm | 147 + third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c | 141 + third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm | 1302 +++++++ third_party/aom/aom_dsp/arm/idct32x32_add_neon.c | 686 ++++ third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm | 71 + third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c | 47 + third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm | 193 ++ third_party/aom/aom_dsp/arm/idct4x4_add_neon.c | 146 + third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm | 91 + third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c | 62 + third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm | 522 +++ third_party/aom/aom_dsp/arm/idct8x8_add_neon.c | 509 +++ third_party/aom/aom_dsp/arm/intrapred_neon.c | 757 ++++ third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm | 633 ++++ third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm | 202 ++ third_party/aom/aom_dsp/arm/loopfilter_16_neon.c | 174 + third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm | 252 ++ third_party/aom/aom_dsp/arm/loopfilter_4_neon.c | 250 ++ third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm | 428 +++ third_party/aom/aom_dsp/arm/loopfilter_8_neon.c | 430 +++ third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm | 638 ++++ third_party/aom/aom_dsp/arm/loopfilter_neon.c | 49 + third_party/aom/aom_dsp/arm/sad4d_neon.c | 225 ++ third_party/aom/aom_dsp/arm/sad_media.asm | 98 + third_party/aom/aom_dsp/arm/sad_neon.c | 224 ++ third_party/aom/aom_dsp/arm/save_reg_neon.asm | 39 + .../aom/aom_dsp/arm/subpel_variance_media.c | 81 + third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 134 + third_party/aom/aom_dsp/arm/subtract_neon.c | 80 + .../arm/variance_halfpixvar16x16_h_media.asm | 185 + .../arm/variance_halfpixvar16x16_hv_media.asm | 225 ++ .../arm/variance_halfpixvar16x16_v_media.asm | 187 + third_party/aom/aom_dsp/arm/variance_media.asm | 361 ++ third_party/aom/aom_dsp/arm/variance_neon.c | 400 +++ third_party/aom/aom_dsp/avg.c | 232 ++ third_party/aom/aom_dsp/binary_codes_reader.c | 117 + third_party/aom/aom_dsp/binary_codes_reader.h | 38 + third_party/aom/aom_dsp/binary_codes_writer.c | 211 ++ third_party/aom/aom_dsp/binary_codes_writer.h | 70 + third_party/aom/aom_dsp/bitreader.h | 276 ++ third_party/aom/aom_dsp/bitreader_buffer.c | 47 + third_party/aom/aom_dsp/bitreader_buffer.h | 48 + third_party/aom/aom_dsp/bitwriter.h | 255 ++ third_party/aom/aom_dsp/bitwriter_buffer.c | 61 + third_party/aom/aom_dsp/bitwriter_buffer.h | 44 + third_party/aom/aom_dsp/blend.h | 42 + third_party/aom/aom_dsp/blend_a64_hmask.c | 71 + third_party/aom/aom_dsp/blend_a64_mask.c | 145 + third_party/aom/aom_dsp/blend_a64_vmask.c | 73 + third_party/aom/aom_dsp/buf_ans.c | 71 + third_party/aom/aom_dsp/buf_ans.h | 133 + third_party/aom/aom_dsp/daalaboolreader.c | 37 + third_party/aom/aom_dsp/daalaboolreader.h | 164 + third_party/aom/aom_dsp/daalaboolwriter.c | 32 + third_party/aom/aom_dsp/daalaboolwriter.h | 87 + third_party/aom/aom_dsp/dkboolreader.c | 110 + third_party/aom/aom_dsp/dkboolreader.h | 181 + third_party/aom/aom_dsp/dkboolwriter.c | 44 + third_party/aom/aom_dsp/dkboolwriter.h | 104 + third_party/aom/aom_dsp/entcode.c | 53 + third_party/aom/aom_dsp/entcode.h | 46 + third_party/aom/aom_dsp/entdec.c | 300 ++ third_party/aom/aom_dsp/entdec.h | 91 + third_party/aom/aom_dsp/entenc.c | 507 +++ third_party/aom/aom_dsp/entenc.h | 91 + third_party/aom/aom_dsp/fastssim.c | 493 +++ third_party/aom/aom_dsp/fwd_txfm.c | 809 +++++ third_party/aom/aom_dsp/fwd_txfm.h | 29 + third_party/aom/aom_dsp/intrapred.c | 971 ++++++ third_party/aom/aom_dsp/inv_txfm.c | 1445 ++++++++ third_party/aom/aom_dsp/inv_txfm.h | 91 + third_party/aom/aom_dsp/loopfilter.c | 900 +++++ third_party/aom/aom_dsp/mips/add_noise_msa.c | 60 + .../aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c | 704 ++++ .../aom/aom_dsp/mips/aom_convolve8_avg_msa.c | 605 ++++ .../aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c | 677 ++++ .../aom/aom_dsp/mips/aom_convolve8_horiz_msa.c | 692 ++++ third_party/aom/aom_dsp/mips/aom_convolve8_msa.c | 630 ++++ .../aom/aom_dsp/mips/aom_convolve8_vert_msa.c | 699 ++++ .../aom/aom_dsp/mips/aom_convolve_avg_msa.c | 233 ++ .../aom/aom_dsp/mips/aom_convolve_copy_msa.c | 248 ++ third_party/aom/aom_dsp/mips/aom_convolve_msa.h | 124 + third_party/aom/aom_dsp/mips/avg_msa.c | 57 + third_party/aom/aom_dsp/mips/common_dspr2.c | 31 + third_party/aom/aom_dsp/mips/common_dspr2.h | 49 + third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c | 256 ++ .../aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c | 802 +++++ third_party/aom/aom_dsp/mips/convolve2_dspr2.c | 1030 ++++++ .../aom/aom_dsp/mips/convolve2_horiz_dspr2.c | 681 ++++ .../aom/aom_dsp/mips/convolve2_vert_dspr2.c | 237 ++ third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c | 641 ++++ .../aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c | 998 ++++++ third_party/aom/aom_dsp/mips/convolve8_dspr2.c | 1590 +++++++++ .../aom/aom_dsp/mips/convolve8_horiz_dspr2.c | 878 +++++ .../aom/aom_dsp/mips/convolve8_vert_dspr2.c | 360 ++ .../aom/aom_dsp/mips/convolve_common_dspr2.h | 59 + third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c | 948 +++++ third_party/aom/aom_dsp/mips/fwd_txfm_msa.c | 246 ++ third_party/aom/aom_dsp/mips/fwd_txfm_msa.h | 381 ++ third_party/aom/aom_dsp/mips/idct16x16_msa.c | 486 +++ third_party/aom/aom_dsp/mips/idct32x32_msa.c | 730 ++++ third_party/aom/aom_dsp/mips/idct4x4_msa.c | 99 + third_party/aom/aom_dsp/mips/idct8x8_msa.c | 117 + third_party/aom/aom_dsp/mips/intrapred16_dspr2.c | 325 ++ third_party/aom/aom_dsp/mips/intrapred4_dspr2.c | 225 ++ third_party/aom/aom_dsp/mips/intrapred8_dspr2.c | 603 ++++ third_party/aom/aom_dsp/mips/intrapred_msa.c | 739 ++++ third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h | 80 + third_party/aom/aom_dsp/mips/inv_txfm_msa.h | 412 +++ third_party/aom/aom_dsp/mips/itrans16_dspr2.c | 1190 +++++++ third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c | 1042 ++++++ third_party/aom/aom_dsp/mips/itrans32_dspr2.c | 1030 ++++++ third_party/aom/aom_dsp/mips/itrans4_dspr2.c | 342 ++ third_party/aom/aom_dsp/mips/itrans8_dspr2.c | 645 ++++ third_party/aom/aom_dsp/mips/loopfilter_16_msa.c | 1487 ++++++++ third_party/aom/aom_dsp/mips/loopfilter_4_msa.c | 147 + third_party/aom/aom_dsp/mips/loopfilter_8_msa.c | 333 ++ .../aom/aom_dsp/mips/loopfilter_filters_dspr2.c | 327 ++ .../aom/aom_dsp/mips/loopfilter_filters_dspr2.h | 735 ++++ .../aom/aom_dsp/mips/loopfilter_macros_dspr2.h | 436 +++ .../aom/aom_dsp/mips/loopfilter_masks_dspr2.h | 356 ++ third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c | 589 ++++ .../aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c | 734 ++++ .../aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c | 757 ++++ third_party/aom/aom_dsp/mips/loopfilter_msa.h | 251 ++ third_party/aom/aom_dsp/mips/macros_msa.h | 2057 +++++++++++ third_party/aom/aom_dsp/mips/sad_msa.c | 1529 +++++++++ .../aom/aom_dsp/mips/sub_pixel_variance_msa.c | 1795 ++++++++++ third_party/aom/aom_dsp/mips/subtract_msa.c | 265 ++ third_party/aom/aom_dsp/mips/txfm_macros_msa.h | 97 + third_party/aom/aom_dsp/mips/variance_msa.c | 632 ++++ third_party/aom/aom_dsp/postproc.h | 26 + third_party/aom/aom_dsp/prob.c | 236 ++ third_party/aom/aom_dsp/prob.h | 198 ++ third_party/aom/aom_dsp/psnr.c | 373 ++ third_party/aom/aom_dsp/psnr.h | 79 + third_party/aom/aom_dsp/psnrhvs.c | 276 ++ third_party/aom/aom_dsp/quantize.c | 832 +++++ third_party/aom/aom_dsp/quantize.h | 120 + third_party/aom/aom_dsp/sad.c | 512 +++ third_party/aom/aom_dsp/simd/v128_intrinsics.h | 268 ++ third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h | 671 ++++ third_party/aom/aom_dsp/simd/v128_intrinsics_c.h | 707 ++++ third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h | 511 +++ third_party/aom/aom_dsp/simd/v256_intrinsics.h | 283 ++ third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h | 17 + third_party/aom/aom_dsp/simd/v256_intrinsics_c.h | 724 ++++ .../aom/aom_dsp/simd/v256_intrinsics_v128.h | 545 +++ third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h | 548 +++ third_party/aom/aom_dsp/simd/v64_intrinsics.h | 223 ++ third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h | 583 ++++ third_party/aom/aom_dsp/simd/v64_intrinsics_c.h | 919 +++++ third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h | 470 +++ third_party/aom/aom_dsp/ssim.c | 462 +++ third_party/aom/aom_dsp/ssim.h | 88 + third_party/aom/aom_dsp/subtract.c | 55 + third_party/aom/aom_dsp/sum_squares.c | 40 + third_party/aom/aom_dsp/txfm_common.h | 70 + third_party/aom/aom_dsp/variance.c | 1249 +++++++ third_party/aom/aom_dsp/variance.h | 132 + third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 182 + .../aom/aom_dsp/x86/aom_convolve_copy_sse2.asm | 345 ++ .../aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 965 ++++++ .../x86/aom_high_subpixel_bilinear_sse2.asm | 497 +++ .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 575 ++++ .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 920 +++++ .../aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 990 ++++++ .../aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 883 +++++ .../aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 451 +++ .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 421 +++ third_party/aom/aom_dsp/x86/avg_intrin_sse2.c | 426 +++ third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm | 124 + third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c | 36 + third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c | 924 +++++ third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c | 285 ++ third_party/aom/aom_dsp/x86/blend_sse4.h | 146 + third_party/aom/aom_dsp/x86/convolve.h | 288 ++ third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c | 862 +++++ .../aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h | 3022 ++++++++++++++++ .../aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h | 3201 +++++++++++++++++ third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c | 24 + third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h | 35 + third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h | 1014 ++++++ third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c | 273 ++ third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 362 ++ .../aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 204 ++ .../aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm | 349 ++ .../aom/aom_dsp/x86/halfpix_variance_sse2.c | 77 + third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c | 1151 +++++++ .../aom/aom_dsp/x86/highbd_intrapred_sse2.asm | 456 +++ .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 1140 ++++++ .../aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 155 + third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm | 290 ++ third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm | 366 ++ .../x86/highbd_subpel_variance_impl_sse2.asm | 1040 ++++++ third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c | 364 ++ .../aom/aom_dsp/x86/highbd_variance_impl_sse2.asm | 316 ++ third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 695 ++++ third_party/aom/aom_dsp/x86/highbd_variance_sse4.c | 216 ++ third_party/aom/aom_dsp/x86/intrapred_sse2.asm | 771 +++++ third_party/aom/aom_dsp/x86/intrapred_ssse3.asm | 410 +++ third_party/aom/aom_dsp/x86/inv_txfm_sse2.c | 3631 ++++++++++++++++++++ third_party/aom/aom_dsp/x86/inv_txfm_sse2.h | 265 ++ third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c | 1333 +++++++ third_party/aom/aom_dsp/x86/inv_wht_sse2.asm | 112 + third_party/aom/aom_dsp/x86/loopfilter_avx2.c | 915 +++++ third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 1892 ++++++++++ .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 334 ++ .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1948 +++++++++++ third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 262 ++ third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 355 ++ .../aom/aom_dsp/x86/quantize_avx_x86_64.asm | 547 +++ third_party/aom/aom_dsp/x86/quantize_sse2.c | 249 ++ .../aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 349 ++ third_party/aom/aom_dsp/x86/sad4d_avx2.c | 216 ++ third_party/aom/aom_dsp/x86/sad4d_sse2.asm | 253 ++ third_party/aom/aom_dsp/x86/sad_avx2.c | 187 + third_party/aom/aom_dsp/x86/sad_highbd_avx2.c | 1043 ++++++ third_party/aom/aom_dsp/x86/sad_impl_avx2.c | 233 ++ third_party/aom/aom_dsp/x86/sad_sse2.asm | 345 ++ third_party/aom/aom_dsp/x86/sad_sse3.asm | 377 ++ third_party/aom/aom_dsp/x86/sad_sse4.asm | 362 ++ third_party/aom/aom_dsp/x86/sad_ssse3.asm | 373 ++ third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm | 219 ++ .../aom/aom_dsp/x86/subpel_variance_sse2.asm | 1489 ++++++++ third_party/aom/aom_dsp/x86/subtract_sse2.asm | 150 + third_party/aom/aom_dsp/x86/sum_squares_sse2.c | 210 ++ third_party/aom/aom_dsp/x86/synonyms.h | 120 + third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 204 ++ third_party/aom/aom_dsp/x86/txfm_common_intrin.h | 31 + third_party/aom/aom_dsp/x86/txfm_common_sse2.h | 326 ++ third_party/aom/aom_dsp/x86/variance_avx2.c | 192 ++ third_party/aom/aom_dsp/x86/variance_impl_avx2.c | 713 ++++ third_party/aom/aom_dsp/x86/variance_sse2.c | 690 ++++ 266 files changed, 119012 insertions(+) create mode 100644 third_party/aom/aom_dsp/add_noise.c create mode 100644 third_party/aom/aom_dsp/ans.h create mode 100644 third_party/aom/aom_dsp/ansreader.h create mode 100644 third_party/aom/aom_dsp/answriter.h create mode 100644 third_party/aom/aom_dsp/aom_convolve.c create mode 100644 third_party/aom/aom_dsp/aom_convolve.h create mode 100644 third_party/aom/aom_dsp/aom_dsp.cmake create mode 100644 third_party/aom/aom_dsp/aom_dsp.mk create mode 100644 third_party/aom/aom_dsp/aom_dsp_common.h create mode 100644 third_party/aom/aom_dsp/aom_dsp_rtcd.c create mode 100755 third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl create mode 100644 third_party/aom/aom_dsp/aom_filter.h create mode 100644 third_party/aom/aom_dsp/aom_simd.h create mode 100644 third_party/aom/aom_dsp/aom_simd_inline.h create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_neon.c create mode 100644 third_party/aom/aom_dsp/arm/avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/bilinear_filter_media.asm create mode 100644 third_party/aom/aom_dsp/arm/fwd_txfm_neon.c create mode 100644 third_party/aom/aom_dsp/arm/hadamard_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon.c create mode 100644 third_party/aom/aom_dsp/arm/intrapred_neon.c create mode 100644 third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon.c create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_4_neon.c create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon.c create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sad4d_neon.c create mode 100644 third_party/aom/aom_dsp/arm/sad_media.asm create mode 100644 third_party/aom/aom_dsp/arm/sad_neon.c create mode 100644 third_party/aom/aom_dsp/arm/save_reg_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/subpel_variance_media.c create mode 100644 third_party/aom/aom_dsp/arm/subpel_variance_neon.c create mode 100644 third_party/aom/aom_dsp/arm/subtract_neon.c create mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_media.asm create mode 100644 third_party/aom/aom_dsp/arm/variance_neon.c create mode 100644 third_party/aom/aom_dsp/avg.c create mode 100644 third_party/aom/aom_dsp/binary_codes_reader.c create mode 100644 third_party/aom/aom_dsp/binary_codes_reader.h create mode 100644 third_party/aom/aom_dsp/binary_codes_writer.c create mode 100644 third_party/aom/aom_dsp/binary_codes_writer.h create mode 100644 third_party/aom/aom_dsp/bitreader.h create mode 100644 third_party/aom/aom_dsp/bitreader_buffer.c create mode 100644 third_party/aom/aom_dsp/bitreader_buffer.h create mode 100644 third_party/aom/aom_dsp/bitwriter.h create mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.c create mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.h create mode 100644 third_party/aom/aom_dsp/blend.h create mode 100644 third_party/aom/aom_dsp/blend_a64_hmask.c create mode 100644 third_party/aom/aom_dsp/blend_a64_mask.c create mode 100644 third_party/aom/aom_dsp/blend_a64_vmask.c create mode 100644 third_party/aom/aom_dsp/buf_ans.c create mode 100644 third_party/aom/aom_dsp/buf_ans.h create mode 100644 third_party/aom/aom_dsp/daalaboolreader.c create mode 100644 third_party/aom/aom_dsp/daalaboolreader.h create mode 100644 third_party/aom/aom_dsp/daalaboolwriter.c create mode 100644 third_party/aom/aom_dsp/daalaboolwriter.h create mode 100644 third_party/aom/aom_dsp/dkboolreader.c create mode 100644 third_party/aom/aom_dsp/dkboolreader.h create mode 100644 third_party/aom/aom_dsp/dkboolwriter.c create mode 100644 third_party/aom/aom_dsp/dkboolwriter.h create mode 100644 third_party/aom/aom_dsp/entcode.c create mode 100644 third_party/aom/aom_dsp/entcode.h create mode 100644 third_party/aom/aom_dsp/entdec.c create mode 100644 third_party/aom/aom_dsp/entdec.h create mode 100644 third_party/aom/aom_dsp/entenc.c create mode 100644 third_party/aom/aom_dsp/entenc.h create mode 100644 third_party/aom/aom_dsp/fastssim.c create mode 100644 third_party/aom/aom_dsp/fwd_txfm.c create mode 100644 third_party/aom/aom_dsp/fwd_txfm.h create mode 100644 third_party/aom/aom_dsp/intrapred.c create mode 100644 third_party/aom/aom_dsp/inv_txfm.c create mode 100644 third_party/aom/aom_dsp/inv_txfm.h create mode 100644 third_party/aom/aom_dsp/loopfilter.c create mode 100644 third_party/aom/aom_dsp/mips/add_noise_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c create mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_msa.h create mode 100644 third_party/aom/aom_dsp/mips/avg_msa.c create mode 100644 third_party/aom/aom_dsp/mips/common_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/common_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/convolve_common_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c create mode 100644 third_party/aom/aom_dsp/mips/fwd_txfm_msa.c create mode 100644 third_party/aom/aom_dsp/mips/fwd_txfm_msa.h create mode 100644 third_party/aom/aom_dsp/mips/idct16x16_msa.c create mode 100644 third_party/aom/aom_dsp/mips/idct32x32_msa.c create mode 100644 third_party/aom/aom_dsp/mips/idct4x4_msa.c create mode 100644 third_party/aom/aom_dsp/mips/idct8x8_msa.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred16_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred4_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred8_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/intrapred_msa.c create mode 100644 third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/inv_txfm_msa.h create mode 100644 third_party/aom/aom_dsp/mips/itrans16_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/itrans32_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/itrans4_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/itrans8_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_16_msa.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_4_msa.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_8_msa.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c create mode 100644 third_party/aom/aom_dsp/mips/loopfilter_msa.h create mode 100644 third_party/aom/aom_dsp/mips/macros_msa.h create mode 100644 third_party/aom/aom_dsp/mips/sad_msa.c create mode 100644 third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c create mode 100644 third_party/aom/aom_dsp/mips/subtract_msa.c create mode 100644 third_party/aom/aom_dsp/mips/txfm_macros_msa.h create mode 100644 third_party/aom/aom_dsp/mips/variance_msa.c create mode 100644 third_party/aom/aom_dsp/postproc.h create mode 100644 third_party/aom/aom_dsp/prob.c create mode 100644 third_party/aom/aom_dsp/prob.h create mode 100644 third_party/aom/aom_dsp/psnr.c create mode 100644 third_party/aom/aom_dsp/psnr.h create mode 100644 third_party/aom/aom_dsp/psnrhvs.c create mode 100644 third_party/aom/aom_dsp/quantize.c create mode 100644 third_party/aom/aom_dsp/quantize.h create mode 100644 third_party/aom/aom_dsp/sad.c create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h create mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_c.h create mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h create mode 100644 third_party/aom/aom_dsp/ssim.c create mode 100644 third_party/aom/aom_dsp/ssim.h create mode 100644 third_party/aom/aom_dsp/subtract.c create mode 100644 third_party/aom/aom_dsp/sum_squares.c create mode 100644 third_party/aom/aom_dsp/txfm_common.h create mode 100644 third_party/aom/aom_dsp/variance.c create mode 100644 third_party/aom/aom_dsp/variance.h create mode 100644 third_party/aom/aom_dsp/x86/aom_asm_stubs.c create mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/avg_intrin_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/blend_sse4.h create mode 100644 third_party/aom/aom_dsp/x86/convolve.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/inv_wht_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/loopfilter_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/loopfilter_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/quantize_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/sad4d_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad4d_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_highbd_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_impl_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sad_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_sse3.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_sse4.asm create mode 100644 third_party/aom/aom_dsp/x86/sad_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/subtract_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/synonyms.h create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_intrin.h create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/variance_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/variance_impl_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/variance_sse2.c (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c new file mode 100644 index 000000000..389cf2049 --- /dev/null +++ b/third_party/aom/aom_dsp/add_noise.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], + char whiteclamp[16], char bothclamp[16], + unsigned int width, unsigned int height, int pitch) { + unsigned int i, j; + + for (i = 0; i < height; ++i) { + uint8_t *pos = start + i * pitch; + char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT + + for (j = 0; j < width; ++j) { + int v = pos[j]; + + v = clamp(v - blackclamp[0], 0, 255); + v = clamp(v + bothclamp[0], 0, 255); + v = clamp(v - whiteclamp[0], 0, 255); + + pos[j] = v + ref[j]; + } + } +} + +static double gaussian(double sigma, double mu, double x) { + return 1 / (sigma * sqrt(2.0 * 3.14159265)) * + (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); +} + +int aom_setup_noise(double sigma, int size, char *noise) { + char char_dist[256]; + int next = 0, i, j; + + // set up a 256 entry lookup that matches gaussian distribution + for (i = -32; i < 32; ++i) { + const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); + if (a_i) { + for (j = 0; j < a_i; ++j) { + char_dist[next + j] = (char)i; + } + next = next + j; + } + } + + // Rounding error - might mean we have less than 256. + for (; next < 256; ++next) { + char_dist[next] = 0; + } + + for (i = 0; i < size; ++i) { + noise[i] = char_dist[rand() & 0xff]; // NOLINT + } + + // Returns the highest non 0 value used in distribution. + return -char_dist[0]; +} diff --git a/third_party/aom/aom_dsp/ans.h b/third_party/aom/aom_dsp/ans.h new file mode 100644 index 000000000..a7a2f0eab --- /dev/null +++ b/third_party/aom/aom_dsp/ans.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_ANS_H_ +#define AOM_DSP_ANS_H_ +// Constants, types and utilities for Asymmetric Numeral Systems +// http://arxiv.org/abs/1311.2540v2 + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Use windowed ANS, size is passed in at initialization +#define ANS_MAX_SYMBOLS 1 +#define ANS_REVERSE 1 + +typedef uint8_t AnsP8; +#define ANS_P8_PRECISION 256u +#define ANS_P8_SHIFT 8 +#define RANS_PROB_BITS 15 +#define RANS_PRECISION (1u << RANS_PROB_BITS) + +// L_BASE is the ANS base state. L_BASE % PRECISION must be 0. +#define L_BASE (1u << 17) +#define IO_BASE 256 +// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_ANS_H_ diff --git a/third_party/aom/aom_dsp/ansreader.h b/third_party/aom/aom_dsp/ansreader.h new file mode 100644 index 000000000..e50c63b2d --- /dev/null +++ b/third_party/aom/aom_dsp/ansreader.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_ANSREADER_H_ +#define AOM_DSP_ANSREADER_H_ +// An implementation of Asymmetric Numeral Systems +// http://arxiv.org/abs/1311.2540v2 +// Implements decoding of: +// * rABS (range Asymmetric Binary Systems), a boolean coder +// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" +#include "aom_dsp/ans.h" +#include "aom_ports/mem_ops.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct AnsDecoder { + const uint8_t *buf; + int buf_offset; + uint32_t state; +#if ANS_MAX_SYMBOLS + int symbols_left; + int window_size; +#endif +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif +}; + +static INLINE int ans_read_reinit(struct AnsDecoder *const ans); + +static INLINE unsigned refill_state(struct AnsDecoder *const ans, + unsigned state) { +#if ANS_REVERSE + while (state < L_BASE && ans->buf_offset < 0) { + state = state * IO_BASE + ans->buf[ans->buf_offset++]; + } +#else + while (state < L_BASE && ans->buf_offset > 0) { + state = state * IO_BASE + ans->buf[--ans->buf_offset]; + } +#endif + return state; +} + +// Decode one rABS encoded boolean where the probability of the value being zero +// is p0. +static INLINE int rabs_read(struct AnsDecoder *ans, AnsP8 p0) { +#if ANS_MAX_SYMBOLS + if (ans->symbols_left-- == 0) { + ans_read_reinit(ans); + ans->symbols_left--; + } +#endif + unsigned state = refill_state(ans, ans->state); + const unsigned quotient = state / ANS_P8_PRECISION; + const unsigned remainder = state % ANS_P8_PRECISION; + const int value = remainder >= p0; + const unsigned qp0 = quotient * p0; + if (value) + state = state - qp0 - p0; + else + state = qp0 + remainder; + ans->state = state; + return value; +} + +// Decode one rABS encoded boolean where the probability of the value being zero +// is one half. +static INLINE int rabs_read_bit(struct AnsDecoder *ans) { +#if ANS_MAX_SYMBOLS + if (ans->symbols_left-- == 0) { + ans_read_reinit(ans); + ans->symbols_left--; + } +#endif + unsigned state = refill_state(ans, ans->state); + const int value = !!(state & 0x80); + ans->state = ((state >> 1) & ~0x7F) | (state & 0x7F); + return value; +} + +struct rans_dec_sym { + uint8_t val; + aom_cdf_prob prob; + aom_cdf_prob cum_prob; // not-inclusive +}; + +static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf, + aom_cdf_prob rem) { + int i; + aom_cdf_prob cum_prob = 0, top_prob; + // TODO(skal): if critical, could be a binary search. + // Or, better, an O(1) alias-table. + for (i = 0; rem >= (top_prob = cdf[i]); ++i) { + cum_prob = top_prob; + } + out->val = i; + out->prob = top_prob - cum_prob; + out->cum_prob = cum_prob; +} + +static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) { + unsigned rem; + unsigned quo; + struct rans_dec_sym sym; +#if ANS_MAX_SYMBOLS + if (ans->symbols_left-- == 0) { + ans_read_reinit(ans); + ans->symbols_left--; + } +#endif + ans->state = refill_state(ans, ans->state); + quo = ans->state / RANS_PRECISION; + rem = ans->state % RANS_PRECISION; + fetch_sym(&sym, tab, rem); + ans->state = quo * sym.prob + rem - sym.cum_prob; + return sym.val; +} + +static INLINE int ans_read_init(struct AnsDecoder *const ans, + const uint8_t *const buf, int offset) { + unsigned x; + if (offset < 1) return 1; +#if ANS_REVERSE + ans->buf = buf + offset; + ans->buf_offset = -offset; + x = buf[0]; + if ((x & 0x80) == 0) { // Marker is 0xxx xxxx + if (offset < 2) return 1; + ans->buf_offset += 2; + ans->state = mem_get_be16(buf) & 0x7FFF; +#if L_BASE * IO_BASE > (1 << 23) + } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx + if (offset < 3) return 1; + ans->buf_offset += 3; + ans->state = mem_get_be24(buf) & 0x3FFFFF; + } else { // Marker is 11xx xxxx + if (offset < 4) return 1; + ans->buf_offset += 4; + ans->state = mem_get_be32(buf) & 0x3FFFFFFF; +#else + } else { // Marker is 1xxx xxxx + if (offset < 3) return 1; + ans->buf_offset += 3; + ans->state = mem_get_be24(buf) & 0x7FFFFF; +#endif + } +#else + ans->buf = buf; + x = buf[offset - 1]; + if ((x & 0x80) == 0) { // Marker is 0xxx xxxx + if (offset < 2) return 1; + ans->buf_offset = offset - 2; + ans->state = mem_get_le16(buf + offset - 2) & 0x7FFF; + } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx + if (offset < 3) return 1; + ans->buf_offset = offset - 3; + ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF; + } else if ((x & 0xE0) == 0xE0) { // Marker is 111x xxxx + if (offset < 4) return 1; + ans->buf_offset = offset - 4; + ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF; + } else { + // Marker 110x xxxx implies this byte is a superframe marker + return 1; + } +#endif // ANS_REVERSE +#if CONFIG_ACCOUNTING + ans->accounting = NULL; +#endif + ans->state += L_BASE; + if (ans->state >= L_BASE * IO_BASE) return 1; +#if ANS_MAX_SYMBOLS + assert(ans->window_size > 1); + ans->symbols_left = ans->window_size; +#endif + return 0; +} + +#if ANS_REVERSE +static INLINE int ans_read_reinit(struct AnsDecoder *const ans) { + return ans_read_init(ans, ans->buf + ans->buf_offset, -ans->buf_offset); +} +#endif + +static INLINE int ans_read_end(const struct AnsDecoder *const ans) { + return ans->buf_offset == 0 && ans->state < L_BASE; +} + +static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) { + return ans->state < L_BASE / RANS_PRECISION; +} +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_ANSREADER_H_ diff --git a/third_party/aom/aom_dsp/answriter.h b/third_party/aom/aom_dsp/answriter.h new file mode 100644 index 000000000..353acf1a9 --- /dev/null +++ b/third_party/aom/aom_dsp/answriter.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_ANSWRITER_H_ +#define AOM_DSP_ANSWRITER_H_ +// An implementation of Asymmetric Numeral Systems +// http://arxiv.org/abs/1311.2540v2 +// Implements encoding of: +// * rABS (range Asymmetric Binary Systems), a boolean coder +// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/ans.h" +#include "aom_dsp/prob.h" +#include "aom_ports/mem_ops.h" +#include "av1/common/odintrin.h" + +#if RANS_PRECISION <= OD_DIVU_DMAX +#define ANS_DIVREM(quotient, remainder, dividend, divisor) \ + do { \ + quotient = OD_DIVU_SMALL((dividend), (divisor)); \ + remainder = (dividend) - (quotient) * (divisor); \ + } while (0) +#else +#define ANS_DIVREM(quotient, remainder, dividend, divisor) \ + do { \ + quotient = (dividend) / (divisor); \ + remainder = (dividend) % (divisor); \ + } while (0) +#endif + +#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor)) + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +struct AnsCoder { + uint8_t *buf; + int buf_offset; + uint32_t state; +}; + +static INLINE void ans_write_init(struct AnsCoder *const ans, + uint8_t *const buf) { + ans->buf = buf; + ans->buf_offset = 0; + ans->state = L_BASE; +} + +static INLINE int ans_write_end(struct AnsCoder *const ans) { + uint32_t state; + int ans_size; + assert(ans->state >= L_BASE); + assert(ans->state < L_BASE * IO_BASE); + state = ans->state - L_BASE; + if (state < (1u << 15)) { + mem_put_le16(ans->buf + ans->buf_offset, (0x00u << 15) + state); + ans_size = ans->buf_offset + 2; +#if ANS_REVERSE +#if L_BASE * IO_BASE > (1 << 23) + } else if (state < (1u << 22)) { + mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state); + ans_size = ans->buf_offset + 3; + } else if (state < (1u << 30)) { + mem_put_le32(ans->buf + ans->buf_offset, (0x03u << 30) + state); + ans_size = ans->buf_offset + 4; +#else + } else if (state < (1u << 23)) { + mem_put_le24(ans->buf + ans->buf_offset, (0x01u << 23) + state); + ans_size = ans->buf_offset + 3; +#endif +#else + } else if (state < (1u << 22)) { + mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state); + ans_size = ans->buf_offset + 3; + } else if (state < (1u << 29)) { + mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state); + ans_size = ans->buf_offset + 4; +#endif + } else { + assert(0 && "State is too large to be serialized"); + return ans->buf_offset; + } +#if ANS_REVERSE + { + int i; + uint8_t tmp; + for (i = 0; i < (ans_size >> 1); i++) { + tmp = ans->buf[i]; + ans->buf[i] = ans->buf[ans_size - 1 - i]; + ans->buf[ans_size - 1 - i] = tmp; + } + ans->buf += ans_size; + ans->buf_offset = 0; + ans->state = L_BASE; + } +#endif + return ans_size; +} + +// Write one boolean using rABS where p0 is the probability of the value being +// zero. +static INLINE void rabs_write(struct AnsCoder *ans, int value, AnsP8 p0) { + const AnsP8 p = ANS_P8_PRECISION - p0; + const unsigned l_s = value ? p : p0; + unsigned state = ans->state; + while (state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) { + ans->buf[ans->buf_offset++] = state % IO_BASE; + state /= IO_BASE; + } + const unsigned quotient = ANS_DIV8(state, l_s); + const unsigned remainder = state - quotient * l_s; + ans->state = quotient * ANS_P8_PRECISION + remainder + (value ? p0 : 0); +} + +// Encode one symbol using rANS. +// cum_prob: The cumulative probability before this symbol (the offset of +// the symbol in the symbol cycle) +// prob: The probability of this symbol (l_s from the paper) +// RANS_PRECISION takes the place of m from the paper. +static INLINE void rans_write(struct AnsCoder *ans, aom_cdf_prob cum_prob, + aom_cdf_prob prob) { + unsigned quotient, remainder; + while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * prob) { + ans->buf[ans->buf_offset++] = ans->state % IO_BASE; + ans->state /= IO_BASE; + } + ANS_DIVREM(quotient, remainder, ans->state, prob); + ans->state = quotient * RANS_PRECISION + remainder + cum_prob; +} + +#undef ANS_DIV8 +#undef ANS_DIVREM +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_ANSWRITER_H_ diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c new file mode 100644 index 000000000..74f4c00fb --- /dev/null +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -0,0 +1,854 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, int y0_q4, + int y_step_q4, int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, + dst_stride, y_filters, y0_q4, y_step_q4, w, h); +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + w, h); +} + +void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); +} + +void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, + w, h); +} + +void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); +} + +void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h); +} + +void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, + h); +} + +void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { + int r; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int filter_x_stride, const int16_t *filter_y, + int filter_y_stride, int w, int h) { + int x, y; + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + + src += src_stride; + dst += dst_stride; + } +} + +void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); +} + +#if CONFIG_LOOP_RESTORATION +static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_x[SUBPEL_TAPS / 2 - 1]); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { + uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height); + convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, + dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); +} + +void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); +} + +void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); +} + +void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); +} +#endif // CONFIG_LOOP_RESTORATION + +#if CONFIG_HIGHBITDEPTH +static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h, int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height, bd); + highbd_convolve_vert( + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, + filters_y, y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + // Fixed size intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); + aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst, + dst_stride, NULL, 0, NULL, 0, w, h, bd); +} + +void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int r; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w * sizeof(uint16_t)); + src += src_stride; + dst += dst_stride; + } +} + +void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + } + src += src_stride; + dst += dst_stride; + } +} + +#if CONFIG_LOOP_RESTORATION +static void highbd_convolve_add_src_horiz(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, + int h, int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel_highbd( + ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1], + bd); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + int x, y; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + + src_y[(SUBPEL_TAPS / 2 - 1) * src_stride], + bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + int x0_q4, int x_step_q4, + const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h, + int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, + intermediate_height, bd); + highbd_convolve_add_src_vert( + CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_horiz_c( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd); +} +#endif // CONFIG_LOOP_RESTORATION +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h new file mode 100644 index 000000000..d0de6c5d2 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_convolve.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_DSP_AOM_CONVOLVE_H_ +#define AOM_DSP_AOM_CONVOLVE_H_ + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Note: Fixed size intermediate buffers, place limits on parameters +// of some functions. 2d filtering proceeds in 2 steps: +// (1) Interpolate horizontally into an intermediate buffer, temp. +// (2) Interpolate temp vertically to derive the sub-pixel result. +// Deriving the maximum number of rows in the temp buffer (135): +// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). +// --Largest block size is 64x64 pixels. +// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the +// original frame (in 1/16th pixel units). +// --Must round-up because block may be located at sub-pixel position. +// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. +// --((64 - 1) * 32 + 15) >> 4 + 8 = 135. +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +#define MAX_EXT_SIZE 263 +#else +#define MAX_EXT_SIZE 135 +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION + +typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +#if CONFIG_HIGHBITDEPTH +typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_AOM_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake new file mode 100644 index 000000000..f00348cbc --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -0,0 +1,509 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## +set(AOM_DSP_COMMON_SOURCES + "${AOM_ROOT}/aom_dsp/aom_convolve.c" + "${AOM_ROOT}/aom_dsp/aom_convolve.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" + "${AOM_ROOT}/aom_dsp/aom_filter.h" + "${AOM_ROOT}/aom_dsp/aom_simd.h" + "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/blend.h" + "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" + "${AOM_ROOT}/aom_dsp/intrapred.c" + "${AOM_ROOT}/aom_dsp/loopfilter.c" + "${AOM_ROOT}/aom_dsp/prob.c" + "${AOM_ROOT}/aom_dsp/prob.h" + "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/subtract.c" + "${AOM_ROOT}/aom_dsp/txfm_common.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_intrin.h") + +set(AOM_DSP_COMMON_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm") + +set(AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" + "${AOM_ROOT}/aom_dsp/x86/convolve.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c") + +set(AOM_DSP_COMMON_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.asm") + +set(AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c") + +set(AOM_DSP_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") + +set(AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c") + +set(AOM_DSP_COMMON_ASM_NEON + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm") + +set(AOM_DSP_COMMON_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c" + "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" + "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" + "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") + +if ("${AOM_TARGET_CPU}" STREQUAL "arm64") + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") +endif () + +set(AOM_DSP_COMMON_INTRIN_DSPR2 + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") + +set(AOM_DSP_COMMON_INTRIN_MSA + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_vert_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_avg_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h" + "${AOM_ROOT}/aom_dsp/mips/fwd_dct32x32_msa.c" + "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.c" + "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.h" + "${AOM_ROOT}/aom_dsp/mips/idct16x16_msa.c" + "${AOM_ROOT}/aom_dsp/mips/idct32x32_msa.c" + "${AOM_ROOT}/aom_dsp/mips/idct4x4_msa.c" + "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h" + "${AOM_ROOT}/aom_dsp/mips/macros_msa.h" + "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h") + +if (CONFIG_HIGHBITDEPTH) + set(AOM_DSP_COMMON_ASM_SSE2 + ${AOM_DSP_COMMON_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm") + + set(AOM_DSP_COMMON_INTRIN_SSE2 + ${AOM_DSP_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") + + set(AOM_DSP_COMMON_INTRIN_AVX2 + ${AOM_DSP_COMMON_INTRIN_AVX2} + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c") +else () + set(AOM_DSP_COMMON_INTRIN_DSPR2 + ${AOM_DSP_COMMON_INTRIN_DSPR2} + "${AOM_ROOT}/aom_dsp/mips/itrans16_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans32_cols_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans32_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans4_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/itrans8_dspr2.c") +endif () + +if (CONFIG_ANS) + set(AOM_DSP_COMMON_SOURCES + ${AOM_DSP_COMMON_SOURCES} + "${AOM_ROOT}/aom_dsp/ans.h") +elseif (CONFIG_DAALA_EC) + set(AOM_DSP_COMMON_SOURCES + ${AOM_DSP_COMMON_SOURCES} + "${AOM_ROOT}/aom_dsp/entcode.c" + "${AOM_ROOT}/aom_dsp/entcode.h") +endif () + +if (CONFIG_AV1) + set(AOM_DSP_COMMON_SOURCES + ${AOM_DSP_COMMON_SOURCES} + "${AOM_ROOT}/aom_dsp/inv_txfm.c" + "${AOM_ROOT}/aom_dsp/inv_txfm.h") + + set(AOM_DSP_COMMON_ASM_SSE2 + ${AOM_DSP_COMMON_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") + + set(AOM_DSP_COMMON_INTRIN_SSE2 + ${AOM_DSP_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.h") +endif () + +if (CONFIG_DECODERS) + set(AOM_DSP_DECODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" + "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" + "${AOM_ROOT}/aom_dsp/bitreader.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h") + + if (CONFIG_ANS) + set(AOM_DSP_DECODER_SOURCES + ${AOM_DSP_DECODER_SOURCES} + "${AOM_ROOT}/aom_dsp/ansreader.h") + elseif (CONFIG_DAALA_EC) + set(AOM_DSP_DECODER_SOURCES + ${AOM_DSP_DECODER_SOURCES} + "${AOM_ROOT}/aom_dsp/daalaboolreader.c" + "${AOM_ROOT}/aom_dsp/daalaboolreader.h" + "${AOM_ROOT}/aom_dsp/entdec.c" + "${AOM_ROOT}/aom_dsp/entdec.h") + else () + set(AOM_DSP_DECODER_SOURCES + ${AOM_DSP_DECODER_SOURCES} + "${AOM_ROOT}/aom_dsp/dkboolreader.c" + "${AOM_ROOT}/aom_dsp/dkboolreader.h") + endif () +endif () + +if (CONFIG_ENCODERS) + set(AOM_DSP_ENCODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" + "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" + "${AOM_ROOT}/aom_dsp/bitwriter.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" + "${AOM_ROOT}/aom_dsp/psnr.c" + "${AOM_ROOT}/aom_dsp/psnr.h" + "${AOM_ROOT}/aom_dsp/variance.c" + "${AOM_ROOT}/aom_dsp/variance.h") + + set(AOM_DSP_ENCODER_ASM_SSE2 + ${AOM_DSP_ENCODER_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm") + + set(AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c") + + set(AOM_DSP_ENCODER_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/sad_ssse3.asm") + + set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm") + + set(AOM_DSP_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/aom_dsp/x86/sad_sse3.asm") + set(AOM_DSP_ENCODER_ASM_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sad_sse4.asm") + + set(AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c") + + if (CONFIG_AV1_ENCODER) + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/avg.c" + "${AOM_ROOT}/aom_dsp/fwd_txfm.c" + "${AOM_ROOT}/aom_dsp/fwd_txfm.h" + "${AOM_ROOT}/aom_dsp/quantize.c" + "${AOM_ROOT}/aom_dsp/quantize.h" + "${AOM_ROOT}/aom_dsp/sum_squares.c") + + set(AOM_DSP_ENCODER_INTRIN_SSE2 + ${AOM_DSP_ENCODER_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_dct32_8cols_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c") + + set(AOM_DSP_ENCODER_INTRIN_SSSE3 + ${AOM_DSP_ENCODER_INTRIN_SSSE3} + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c") + + set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64} + "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") + + set(AOM_DSP_ENCODER_AVX_ASM_X86_64 + ${AOM_DSP_ENCODER_AVX_ASM_X86_64} + "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") + + set(AOM_DSP_ENCODER_INTRIN_MSA + "${AOM_ROOT}/aom_dsp/mips/avg_msa.c" + "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" + "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" + "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" + "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") + + if (CONFIG_HIGHBITDEPTH) + set(AOM_DSP_ENCODER_INTRIN_SSE2 + ${AOM_DSP_ENCODER_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c") + endif () + endif () + + if (CONFIG_HIGHBITDEPTH) + set(AOM_DSP_ENCODER_ASM_SSE2 + ${AOM_DSP_ENCODER_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm") + + set(AOM_DSP_ENCODER_INTRIN_SSE2 + ${AOM_DSP_ENCODER_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") + + set(AOM_DSP_ENCODER_INTRIN_SSE4_1 + ${AOM_DSP_ENCODER_INTRIN_SSE4_1} + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c") + + set(AOM_DSP_ENCODER_INTRIN_AVX2 + ${AOM_DSP_ENCODER_INTRIN_AVX2} + "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c") + endif () + + if (CONFIG_ANS) + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/answriter.h" + "${AOM_ROOT}/aom_dsp/buf_ans.c" + "${AOM_ROOT}/aom_dsp/buf_ans.h") + elseif (CONFIG_DAALA_EC) + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" + "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" + "${AOM_ROOT}/aom_dsp/entenc.c" + "${AOM_ROOT}/aom_dsp/entenc.h") + else () + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/dkboolwriter.c" + "${AOM_ROOT}/aom_dsp/dkboolwriter.h") + endif () + + if (CONFIG_INTERNAL_STATS) + set(AOM_DSP_ENCODER_SOURCES + ${AOM_DSP_ENCODER_SOURCES} + "${AOM_ROOT}/aom_dsp/fastssim.c" + "${AOM_ROOT}/aom_dsp/psnrhvs.c" + "${AOM_ROOT}/aom_dsp/ssim.c" + "${AOM_ROOT}/aom_dsp/ssim.h") + endif () +endif () + +if (CONFIG_MOTION_VAR) + set(AOM_DSP_ENCODER_INTRIN_SSE4_1 + ${AOM_DSP_ENCODER_INTRIN_SSE4_1} + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") +endif () + +# Creates aom_dsp build targets. Must not be called until after libaom target +# has been created. +function (setup_aom_dsp_targets) + add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_common) + target_sources(aom PUBLIC $) + + if (CONFIG_DECODERS) + add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder) + target_sources(aom PUBLIC $) + endif () + + if (CONFIG_ENCODERS) + add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder) + target_sources(aom PUBLIC $) + endif () + + if (HAVE_SSE2) + add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE2") + if (CONFIG_ENCODERS) + add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom") + add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE2") + endif() + endif () + + if (HAVE_SSE3 AND CONFIG_ENCODERS) + add_asm_library("aom_dsp_encoder_sse3" "AOM_DSP_ENCODER_INTRIN_SSE3" "aom") + endif () + + if (HAVE_SSSE3) + add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSSE3") + + if (CONFIG_ENCODERS) + if ("${AOM_TARGET_CPU}" STREQUAL "x86_64") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) + endif () + add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom") + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSSE3") + endif () + endif () + + if (HAVE_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SSE4_1") + if (CONFIG_ENCODERS) + if (AOM_DSP_ENCODER_INTRIN_SSE4_1) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE4_1") + endif () + add_asm_library("aom_dsp_encoder_sse4_1" "AOM_DSP_ENCODER_ASM_SSE4_1" + "aom") + endif () + endif () + + if (HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") + add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64" + "aom") + endif () + + if (HAVE_AVX2) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_AVX2") + if (CONFIG_ENCODERS) + add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_AVX2") + endif () + endif () + + if (HAVE_NEON_ASM) + if (AOM_ADS2GAS_REQUIRED) + add_gas_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom") + else () + add_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom") + endif () + endif () + + if (HAVE_NEON) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON") + endif () + + if (HAVE_DSPR2) + add_intrinsics_object_library("" "dspr2" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_DSPR2") + endif () + + if (HAVE_MSA) + add_intrinsics_object_library("" "msa" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_MSA") + if (CONFIG_ENCODERS) + add_intrinsics_object_library("" "msa" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_MSA") + endif () + endif () + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. + set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) +endfunction () diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk new file mode 100644 index 000000000..8c7241b83 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp.mk @@ -0,0 +1,428 @@ +## +## Copyright (c) 2016, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## + + +DSP_SRCS-yes += aom_dsp.mk +DSP_SRCS-yes += aom_dsp_common.h + +DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h + +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms.h + +# bit reader +DSP_SRCS-yes += prob.h +DSP_SRCS-yes += prob.c +DSP_SRCS-$(CONFIG_ANS) += ans.h + +ifeq ($(CONFIG_ENCODERS),yes) +ifeq ($(CONFIG_ANS),yes) +DSP_SRCS-yes += answriter.h +DSP_SRCS-yes += buf_ans.h +DSP_SRCS-yes += buf_ans.c +else ifeq ($(CONFIG_DAALA_EC),yes) +DSP_SRCS-yes += entenc.c +DSP_SRCS-yes += entenc.h +DSP_SRCS-yes += daalaboolwriter.c +DSP_SRCS-yes += daalaboolwriter.h +else +DSP_SRCS-yes += dkboolwriter.h +DSP_SRCS-yes += dkboolwriter.c +endif +DSP_SRCS-yes += bitwriter.h +DSP_SRCS-yes += bitwriter_buffer.c +DSP_SRCS-yes += bitwriter_buffer.h +DSP_SRCS-yes += binary_codes_writer.c +DSP_SRCS-yes += binary_codes_writer.h +DSP_SRCS-yes += psnr.c +DSP_SRCS-yes += psnr.h +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c +DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +endif + +ifeq ($(CONFIG_DECODERS),yes) +ifeq ($(CONFIG_ANS),yes) +DSP_SRCS-yes += ansreader.h +else ifeq ($(CONFIG_DAALA_EC),yes) +DSP_SRCS-yes += entdec.c +DSP_SRCS-yes += entdec.h +DSP_SRCS-yes += daalaboolreader.c +DSP_SRCS-yes += daalaboolreader.h +else +DSP_SRCS-yes += dkboolreader.h +DSP_SRCS-yes += dkboolreader.c +endif +DSP_SRCS-yes += bitreader.h +DSP_SRCS-yes += bitreader_buffer.c +DSP_SRCS-yes += bitreader_buffer.h +DSP_SRCS-yes += binary_codes_reader.c +DSP_SRCS-yes += binary_codes_reader.h +endif + +# intra predictions +DSP_SRCS-yes += intrapred.c + +ifeq ($(CONFIG_DAALA_EC),yes) +DSP_SRCS-yes += entcode.c +DSP_SRCS-yes += entcode.h +endif + +DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +endif # CONFIG_HIGHBITDEPTH + +DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c + +DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c + +# inter predictions +DSP_SRCS-yes += blend.h +DSP_SRCS-yes += blend_a64_mask.c +DSP_SRCS-yes += blend_a64_hmask.c +DSP_SRCS-yes += blend_a64_vmask.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c + +# interpolation filters +DSP_SRCS-yes += aom_convolve.c +DSP_SRCS-yes += aom_convolve.h +DSP_SRCS-yes += aom_filter.h + +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c +DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm +DSP_SRCS-$(HAVE_AVX2) += x86/aom_subpixel_8t_intrin_avx2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c +endif +DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/aom_convolve_neon.c +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/aom_convolve_copy_neon.c +DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c +DSP_SRCS-yes += arm/aom_convolve8_neon.c +DSP_SRCS-yes += arm/aom_convolve_avg_neon.c +DSP_SRCS-yes += arm/aom_convolve_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM + +# common (msa) +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h + +# common (dspr2) +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_vert_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c + +# loop filters +DSP_SRCS-yes += loopfilter.c + +DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c + +DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) +DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/loopfilter_16_neon.c +DSP_SRCS-yes += arm/loopfilter_8_neon.c +DSP_SRCS-yes += arm/loopfilter_4_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM + +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_16_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_4_msa.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_macros_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c +endif # CONFIG_HIGHBITDEPTH + +DSP_SRCS-yes += txfm_common.h +DSP_SRCS-yes += x86/txfm_common_intrin.h +DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h +DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h + +# forward transform +ifneq ($(findstring yes,$(CONFIG_AV1)$(CONFIG_PVQ)),) +DSP_SRCS-$(HAVE_AVX2) += x86/txfm_common_avx2.h +ifeq ($(CONFIG_AV1_ENCODER),yes) +DSP_SRCS-yes += fwd_txfm.c +DSP_SRCS-yes += fwd_txfm.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm +endif +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.h +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h +DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +endif # CONFIG_AV1_ENCODER +endif # CONFIG_AV1 + +# inverse transform +ifeq ($(CONFIG_AV1), yes) +DSP_SRCS-yes += inv_txfm.h +DSP_SRCS-yes += inv_txfm.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h +DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/save_reg_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) +DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) +else +ifeq ($(HAVE_NEON),yes) +DSP_SRCS-yes += arm/idct4x4_1_add_neon.c +DSP_SRCS-yes += arm/idct4x4_add_neon.c +DSP_SRCS-yes += arm/idct8x8_1_add_neon.c +DSP_SRCS-yes += arm/idct8x8_add_neon.c +DSP_SRCS-yes += arm/idct16x16_1_add_neon.c +DSP_SRCS-yes += arm/idct16x16_add_neon.c +DSP_SRCS-yes += arm/idct32x32_1_add_neon.c +DSP_SRCS-yes += arm/idct32x32_add_neon.c +endif # HAVE_NEON +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c + +DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h +DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c + +ifneq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c +DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c +endif # CONFIG_HIGHBITDEPTH +endif # CONFIG_AV1 + +# quantization +ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),) +DSP_SRCS-yes += quantize.c +DSP_SRCS-yes += quantize.h + +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c +endif +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm +endif + +# avg +DSP_SRCS-yes += avg.c +DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c +DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm +endif + +# high bit depth subtract +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subtract_sse2.c +endif + +endif # CONFIG_AV1_ENCODER + +ifeq ($(CONFIG_AV1_ENCODER),yes) +DSP_SRCS-yes += sum_squares.c + +DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c +endif # CONFIG_AV1_ENCODER + +ifeq ($(CONFIG_ENCODERS),yes) +DSP_SRCS-yes += sad.c +DSP_SRCS-yes += subtract.c + +DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c + +DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c + +DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm +DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm +DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c +endif + +ifeq ($(CONFIG_AV1_ENCODER),yes) +ifeq ($(CONFIG_EXT_INTER),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c +endif #CONFIG_EXT_INTER +ifeq ($(CONFIG_MOTION_VAR),yes) +DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c +endif #CONFIG_MOTION_VAR +ifeq ($(CONFIG_EXT_PARTITION),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/sad_impl_avx2.c +endif +endif #CONFIG_AV1_ENCODER + +DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +endif # CONFIG_HIGHBITDEPTH + +endif # CONFIG_ENCODERS + +ifneq ($(filter yes,$(CONFIG_ENCODERS)),) +DSP_SRCS-yes += variance.c +DSP_SRCS-yes += variance.h + +DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM) +DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) +DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c + +DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c +DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c + +DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 +DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c +DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c + +ifeq ($(ARCH_X86_64),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm +endif # ARCH_X86_64 + +DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 + +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +endif # CONFIG_HIGHBITDEPTH +endif # CONFIG_ENCODERS + +DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) + +DSP_SRCS-yes += aom_dsp_rtcd.c +DSP_SRCS-yes += aom_dsp_rtcd_defs.pl + +DSP_SRCS-yes += aom_simd.h +DSP_SRCS-yes += aom_simd_inline.h +DSP_SRCS-yes += simd/v64_intrinsics.h +DSP_SRCS-yes += simd/v64_intrinsics_c.h +DSP_SRCS-yes += simd/v128_intrinsics.h +DSP_SRCS-yes += simd/v128_intrinsics_c.h +DSP_SRCS-yes += simd/v256_intrinsics.h +DSP_SRCS-yes += simd/v256_intrinsics_c.h +DSP_SRCS-yes += simd/v256_intrinsics_v128.h +DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h +DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h +DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h +DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h +DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h +DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h + +$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl)) diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h new file mode 100644 index 000000000..47ffbeb6c --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_AOM_DSP_COMMON_H_ +#define AOM_DSP_AOM_DSP_COMMON_H_ + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef MAX_SB_SIZE +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +#define MAX_SB_SIZE 128 +#else +#define MAX_SB_SIZE 64 +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION +#endif // ndef MAX_SB_SIZE + +#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) +#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) + +#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') + +#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) + +/* Left shifting a negative value became undefined behavior in C99 (downgraded + from merely implementation-defined in C89). This should still compile to the + correct thing on any two's-complement machine, but avoid ubsan warnings.*/ +#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift))) + +// These can be used to give a hint about branch outcomes. +// This can have an effect, even if your target processor has a +// good branch predictor, as these hints can affect basic block +// ordering by the compiler. +#ifdef __GNUC__ +#define LIKELY(v) __builtin_expect(v, 1) +#define UNLIKELY(v) __builtin_expect(v, 0) +#else +#define LIKELY(v) (v) +#define UNLIKELY(v) (v) +#endif + +#define AOM_SWAP(type, a, b) \ + do { \ + type c = (b); \ + b = a; \ + a = c; \ + } while (0) + +#if CONFIG_AOM_QM +typedef uint16_t qm_val_t; +#define AOM_QM_BITS 6 +#endif +#if CONFIG_HIGHBITDEPTH +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int64_t tran_high_t; +typedef int32_t tran_low_t; +#else +// Note: +// tran_low_t is the datatype used for final transform coefficients. +// tran_high_t is the datatype used for intermediate transform stages. +typedef int32_t tran_high_t; +typedef int16_t tran_low_t; +#endif // CONFIG_HIGHBITDEPTH + +static INLINE uint8_t clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} + +static INLINE int clamp(int value, int low, int high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE double fclamp(double value, double low, double high) { + return value < low ? low : (value > high ? high : value); +} + +#if CONFIG_HIGHBITDEPTH +static INLINE uint16_t clip_pixel_highbd(int val, int bd) { + switch (bd) { + case 8: + default: return (uint16_t)clamp(val, 0, 255); + case 10: return (uint16_t)clamp(val, 0, 1023); + case 12: return (uint16_t)clamp(val, 0, 4095); + } +} +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_AOM_DSP_COMMON_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c new file mode 100644 index 000000000..11a57d382 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c @@ -0,0 +1,16 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "./aom_config.h" +#define RTCD_C +#include "./aom_dsp_rtcd.h" +#include "aom_ports/aom_once.h" + +void aom_dsp_rtcd() { once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl new file mode 100755 index 000000000..b4ef0d92f --- /dev/null +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -0,0 +1,1495 @@ +sub aom_dsp_forward_decls() { +print < + +#if defined(_WIN32) +#include +#endif + +#include "./aom_config.h" +#include "./aom_simd_inline.h" + +#define SIMD_CHECK 1 // Sanity checks in C equivalents + +#if HAVE_NEON +#include "simd/v256_intrinsics_arm.h" +// VS compiling for 32 bit targets does not support vector types in +// structs as arguments, which makes the v256 type of the intrinsics +// hard to support, so optimizations for this target are disabled. +#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) +#include "simd/v256_intrinsics_x86.h" +#else +#include "simd/v256_intrinsics.h" +#endif + +#endif // AOM_DSP_AOM_AOM_SIMD_H_ diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h new file mode 100644 index 000000000..02a8b3a17 --- /dev/null +++ b/third_party/aom/aom_dsp/aom_simd_inline.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_AOM_SIMD_INLINE_H_ +#define AOM_DSP_AOM_SIMD_INLINE_H_ + +#include "aom/aom_integer.h" + +#ifndef SIMD_INLINE +#define SIMD_INLINE static AOM_FORCE_INLINE +#endif + +#endif // AOM_DSP_AOM_SIMD_INLINE_H_ diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c new file mode 100644 index 000000000..09429d6d2 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, + int16x4_t dsrc2, int16x4_t dsrc3, + int16x4_t dsrc4, int16x4_t dsrc5, + int16x4_t dsrc6, int16x4_t dsrc7, + int16x8_t q0s16) { + int32x4_t qdst; + int16x4_t d0s16, d1s16; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + + qdst = vmull_lane_s16(dsrc0, d0s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); + qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); + return qdst; +} + +void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h) { + int width; + const uint8_t *s; + uint8_t *d; + uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; + uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; + uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + uint16x8x2_t q0x2u16; + uint8x8x2_t d0x2u8, d1x2u8; + uint32x2x2_t d0x2u32; + uint16x4x2_t d0x2u16, d1x2u16; + uint32x4x2_t q0x2u32; + + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + (void)filter_y; + + q0s16 = vld1q_s16(filter_x); + + src -= 3; // adjust for taps + for (; h > 0; h -= 4) { // loop_horiz_v + s = src; + d24u8 = vld1_u8(s); + s += src_stride; + d25u8 = vld1_u8(s); + s += src_stride; + d26u8 = vld1_u8(s); + s += src_stride; + d27u8 = vld1_u8(s); + + q12u8 = vcombine_u8(d24u8, d25u8); + q13u8 = vcombine_u8(d26u8, d27u8); + + q0x2u16 = + vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); + d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); + d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); + d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); + d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); + d0x2u8 = vtrn_u8(d24u8, d25u8); + d1x2u8 = vtrn_u8(d26u8, d27u8); + + __builtin_prefetch(src + src_stride * 4); + __builtin_prefetch(src + src_stride * 5); + + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); + q10u16 = vmovl_u8(d1x2u8.val[0]); + q11u16 = vmovl_u8(d1x2u8.val[1]); + + src += 7; + d16u16 = vget_low_u16(q8u16); + d17u16 = vget_high_u16(q8u16); + d18u16 = vget_low_u16(q9u16); + d19u16 = vget_high_u16(q9u16); + q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 + q9u16 = vcombine_u16(d17u16, d19u16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz + s = src; + d28u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d29u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d31u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d30u32 = vld1_dup_u32((const uint32_t *)s); + + __builtin_prefetch(src + 64); + + d0x2u16 = + vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); + d1x2u16 = + vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); + d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 + vreinterpret_u8_u16(d1x2u16.val[0])); // d29 + d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 + vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + + __builtin_prefetch(src + 64 + src_stride); + + q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); + q0x2u32 = + vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); + + d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); + d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); + q12u16 = vmovl_u8(d28u8); + q13u16 = vmovl_u8(d29u8); + + __builtin_prefetch(src + 64 + src_stride * 2); + + d = dst; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); + d += dst_stride; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, + d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, + d27s16, d25s16, q0s16); + + __builtin_prefetch(src + 64 + src_stride * 3); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); + d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), + vreinterpret_u32_u16(d0x2u16.val[1])); + d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), + vreinterpret_u8_u32(d0x2u32.val[1])); + + q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); + + q1u8 = vrhaddq_u8(q1u8, q3u8); + + d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); + d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); + + d = dst; + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + + q8u16 = q9u16; + d20s16 = d23s16; + q11u16 = q12u16; + q9u16 = q13u16; + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + } + src += src_stride * 4 - w - 7; + dst += dst_stride * 4 - w; + } + return; +} + +void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int height; + const uint8_t *s; + uint8_t *d; + uint8x8_t d2u8, d3u8; + uint32x2_t d2u32, d3u32, d6u32, d7u32; + uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; + uint8x16_t q1u8, q3u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + (void)filter_x; + + src -= src_stride * 3; + q0s16 = vld1q_s16(filter_y); + for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h + s = src; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); + s += src_stride; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); + s += src_stride; + d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); + s += src_stride; + + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); + q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d = dst; + for (height = h; height > 0; height -= 4) { // loop_vert + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s += src_stride; + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s += src_stride; + + q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); + q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); + d += dst_stride; + d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); + d += dst_stride; + d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); + d -= dst_stride * 3; + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + __builtin_prefetch(s); + __builtin_prefetch(s + src_stride); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, q0s16); + __builtin_prefetch(s + src_stride * 2); + __builtin_prefetch(s + src_stride * 3); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, q0s16); + __builtin_prefetch(d); + __builtin_prefetch(d + dst_stride); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, + d26s16, d27s16, q0s16); + __builtin_prefetch(d + dst_stride * 2); + __builtin_prefetch(d + dst_stride * 3); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, + d27s16, d25s16, q0s16); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + q1u8 = vcombine_u8(d2u8, d3u8); + q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); + + q1u8 = vrhaddq_u8(q1u8, q3u8); + + d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); + d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); + + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + d += dst_stride; + + q8u16 = q10u16; + d18s16 = d22s16; + d19s16 = d24s16; + q10u16 = q13u16; + d22s16 = d25s16; + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm new file mode 100644 index 000000000..80aef992d --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm @@ -0,0 +1,295 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; AV1_FILTER_WEIGHT == 128 + ; AV1_FILTER_SHIFT == 7 + + EXPORT |aom_convolve8_avg_horiz_neon| + EXPORT |aom_convolve8_avg_vert_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|aom_convolve8_avg_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r5, [sp, #32] ; filter_x + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 + + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 + + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +aom_convolve8_avg_loop_horiz_v + vld1.8 {d24}, [r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 + + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + + pld [r0, r1, lsl #2] + + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +aom_convolve8_avg_loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 {d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] + + ; slightly out of order load to match the existing data + vld1.u32 {d6[0]}, [r2], r3 + vld1.u32 {d7[0]}, [r2], r3 + vld1.u32 {d6[1]}, [r2], r3 + vld1.u32 {d7[1]}, [r2], r3 + + sub r2, r2, r3, lsl #2 ; reset for store + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + ; average the new value and the dst value + vrhadd.u8 q1, q1, q3 + + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 + + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_avg_loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r9 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt aom_convolve8_avg_loop_horiz_v + + pop {r4-r10, pc} + + ENDP + +|aom_convolve8_avg_vert_neon| PROC + push {r4-r8, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h + + vld1.s16 {q0}, [r4] ; filter_y + + lsl r1, r1, #1 + lsl r3, r3, #1 + +aom_convolve8_avg_loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter + + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 + + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + +aom_convolve8_avg_loop_vert + ; always process a 4x4 block at a time + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 + + ; extract to s16 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + vld1.u32 {d6[0]}, [r5@32], r3 + vld1.u32 {d6[1]}, [r8@32], r3 + vld1.u32 {d7[0]}, [r5@32], r3 + vld1.u32 {d7[1]}, [r8@32], r3 + + pld [r7] + pld [r4] + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r5] + pld [r8] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + ; average the new value and the dst value + vrhadd.u8 q1, q1, q3 + + sub r5, r5, r3, lsl #1 ; reset for store + sub r8, r8, r3, lsl #1 + + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + vmov q10, q13 + vmov d22, d25 + + subs r12, r12, #4 ; h -= 4 + bgt aom_convolve8_avg_loop_vert + + ; outer loop + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_avg_loop_vert_h + + pop {r4-r8, pc} + + ENDP + END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c new file mode 100644 index 000000000..8ebffb5f9 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, + int16x4_t dsrc2, int16x4_t dsrc3, + int16x4_t dsrc4, int16x4_t dsrc5, + int16x4_t dsrc6, int16x4_t dsrc7, + int16x8_t q0s16) { + int32x4_t qdst; + int16x4_t d0s16, d1s16; + + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + + qdst = vmull_lane_s16(dsrc0, d0s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); + qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); + qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); + qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); + qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); + return qdst; +} + +void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h) { + int width; + const uint8_t *s, *psrc; + uint8_t *d, *pdst; + uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; + uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; + uint8x16_t q12u8, q13u8, q14u8, q15u8; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + uint16x8x2_t q0x2u16; + uint8x8x2_t d0x2u8, d1x2u8; + uint32x2x2_t d0x2u32; + uint16x4x2_t d0x2u16, d1x2u16; + uint32x4x2_t q0x2u32; + + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + (void)filter_y; + + q0s16 = vld1q_s16(filter_x); + + src -= 3; // adjust for taps + for (; h > 0; h -= 4, src += src_stride * 4, + dst += dst_stride * 4) { // loop_horiz_v + s = src; + d24u8 = vld1_u8(s); + s += src_stride; + d25u8 = vld1_u8(s); + s += src_stride; + d26u8 = vld1_u8(s); + s += src_stride; + d27u8 = vld1_u8(s); + + q12u8 = vcombine_u8(d24u8, d25u8); + q13u8 = vcombine_u8(d26u8, d27u8); + + q0x2u16 = + vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); + d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); + d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); + d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); + d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); + d0x2u8 = vtrn_u8(d24u8, d25u8); + d1x2u8 = vtrn_u8(d26u8, d27u8); + + __builtin_prefetch(src + src_stride * 4); + __builtin_prefetch(src + src_stride * 5); + __builtin_prefetch(src + src_stride * 6); + + q8u16 = vmovl_u8(d0x2u8.val[0]); + q9u16 = vmovl_u8(d0x2u8.val[1]); + q10u16 = vmovl_u8(d1x2u8.val[0]); + q11u16 = vmovl_u8(d1x2u8.val[1]); + + d16u16 = vget_low_u16(q8u16); + d17u16 = vget_high_u16(q8u16); + d18u16 = vget_low_u16(q9u16); + d19u16 = vget_high_u16(q9u16); + q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 + q9u16 = vcombine_u16(d17u16, d19u16); + + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 + for (width = w, psrc = src + 7, pdst = dst; width > 0; + width -= 4, psrc += 4, pdst += 4) { // loop_horiz + s = psrc; + d28u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d29u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d31u32 = vld1_dup_u32((const uint32_t *)s); + s += src_stride; + d30u32 = vld1_dup_u32((const uint32_t *)s); + + __builtin_prefetch(psrc + 64); + + d0x2u16 = + vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); + d1x2u16 = + vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); + d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 + vreinterpret_u8_u16(d1x2u16.val[0])); // d29 + d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 + vreinterpret_u8_u16(d1x2u16.val[1])); // d30 + + __builtin_prefetch(psrc + 64 + src_stride); + + q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); + q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); + q0x2u32 = + vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); + + d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); + d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); + q12u16 = vmovl_u8(d28u8); + q13u16 = vmovl_u8(d29u8); + + __builtin_prefetch(psrc + 64 + src_stride * 2); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, + d23s16, d24s16, q0s16); + q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, + d24s16, d26s16, q0s16); + q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, + d26s16, d27s16, q0s16); + q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, + d27s16, d25s16, q0s16); + + __builtin_prefetch(psrc + 60 + src_stride * 3); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u8 = vqmovn_u16(q1u16); + d3u8 = vqmovn_u16(q2u16); + + d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); + d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), + vreinterpret_u32_u16(d0x2u16.val[1])); + d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), + vreinterpret_u8_u32(d0x2u32.val[1])); + + d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); + d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); + + d = pdst; + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + + q8u16 = q9u16; + d20s16 = d23s16; + q11u16 = q12u16; + q9u16 = q13u16; + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + } + } + return; +} + +void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int height; + const uint8_t *s; + uint8_t *d; + uint32x2_t d2u32, d3u32; + uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; + int16x4_t d24s16, d25s16, d26s16, d27s16; + uint16x4_t d2u16, d3u16, d4u16, d5u16; + int16x8_t q0s16; + uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; + int32x4_t q1s32, q2s32, q14s32, q15s32; + + assert(y_step_q4 == 16); + + (void)x_step_q4; + (void)y_step_q4; + (void)filter_x; + + src -= src_stride * 3; + q0s16 = vld1q_s16(filter_y); + for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h + s = src; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); + s += src_stride; + d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); + s += src_stride; + d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); + s += src_stride; + d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); + s += src_stride; + d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); + s += src_stride; + + q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); + q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); + q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); + q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); + + d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); + d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d = dst; + for (height = h; height > 0; height -= 4) { // loop_vert + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); + s += src_stride; + d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); + s += src_stride; + d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); + s += src_stride; + + q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); + q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); + + d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); + d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); + d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); + d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + + __builtin_prefetch(d); + __builtin_prefetch(d + dst_stride); + q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, + d22s16, d24s16, q0s16); + __builtin_prefetch(d + dst_stride * 2); + __builtin_prefetch(d + dst_stride * 3); + q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, + d24s16, d26s16, q0s16); + __builtin_prefetch(s); + __builtin_prefetch(s + src_stride); + q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, + d26s16, d27s16, q0s16); + __builtin_prefetch(s + src_stride * 2); + __builtin_prefetch(s + src_stride * 3); + q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, + d27s16, d25s16, q0s16); + + d2u16 = vqrshrun_n_s32(q1s32, 7); + d3u16 = vqrshrun_n_s32(q2s32, 7); + d4u16 = vqrshrun_n_s32(q14s32, 7); + d5u16 = vqrshrun_n_s32(q15s32, 7); + + q1u16 = vcombine_u16(d2u16, d3u16); + q2u16 = vcombine_u16(d4u16, d5u16); + + d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); + d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); + + vst1_lane_u32((uint32_t *)d, d2u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d2u32, 1); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 0); + d += dst_stride; + vst1_lane_u32((uint32_t *)d, d3u32, 1); + d += dst_stride; + + q8u16 = q10u16; + d18s16 = d22s16; + d19s16 = d24s16; + q10u16 = q13u16; + d22s16 = d25s16; + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm new file mode 100644 index 000000000..38207d864 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm @@ -0,0 +1,273 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + ; These functions are only valid when: + ; x_step_q4 == 16 + ; w%4 == 0 + ; h%4 == 0 + ; taps == 8 + ; AV1_FILTER_WEIGHT == 128 + ; AV1_FILTER_SHIFT == 7 + + EXPORT |aom_convolve8_horiz_neon| + EXPORT |aom_convolve8_vert_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Multiply and accumulate by q0 + MACRO + MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 + vmull.s16 $dst, $src0, d0[0] + vmlal.s16 $dst, $src1, d0[1] + vmlal.s16 $dst, $src2, d0[2] + vmlal.s16 $dst, $src3, d0[3] + vmlal.s16 $dst, $src4, d1[0] + vmlal.s16 $dst, $src5, d1[1] + vmlal.s16 $dst, $src6, d1[2] + vmlal.s16 $dst, $src7, d1[3] + MEND + +; r0 const uint8_t *src +; r1 int src_stride +; r2 uint8_t *dst +; r3 int dst_stride +; sp[]const int16_t *filter_x +; sp[]int x_step_q4 +; sp[]const int16_t *filter_y ; unused +; sp[]int y_step_q4 ; unused +; sp[]int w +; sp[]int h + +|aom_convolve8_horiz_neon| PROC + push {r4-r10, lr} + + sub r0, r0, #3 ; adjust for taps + + ldr r5, [sp, #32] ; filter_x + ldr r6, [sp, #48] ; w + ldr r7, [sp, #52] ; h + + vld1.s16 {q0}, [r5] ; filter_x + + sub r8, r1, r1, lsl #2 ; -src_stride * 3 + add r8, r8, #4 ; -src_stride * 3 + 4 + + sub r4, r3, r3, lsl #2 ; -dst_stride * 3 + add r4, r4, #4 ; -dst_stride * 3 + 4 + + rsb r9, r6, r1, lsl #2 ; reset src for outer loop + sub r9, r9, #7 + rsb r12, r6, r3, lsl #2 ; reset dst for outer loop + + mov r10, r6 ; w loop counter + +aom_convolve8_loop_horiz_v + vld1.8 {d24}, [r0], r1 + vld1.8 {d25}, [r0], r1 + vld1.8 {d26}, [r0], r1 + vld1.8 {d27}, [r0], r8 + + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + + pld [r0, r1, lsl #2] + + vmovl.u8 q8, d24 + vmovl.u8 q9, d25 + vmovl.u8 q10, d26 + vmovl.u8 q11, d27 + + ; save a few instructions in the inner loop + vswp d17, d18 + vmov d23, d21 + + add r0, r0, #3 + +aom_convolve8_loop_horiz + add r5, r0, #64 + + vld1.32 {d28[]}, [r0], r1 + vld1.32 {d29[]}, [r0], r1 + vld1.32 {d31[]}, [r0], r1 + vld1.32 {d30[]}, [r0], r8 + + pld [r5] + + vtrn.16 d28, d31 + vtrn.16 d29, d30 + vtrn.8 d28, d29 + vtrn.8 d31, d30 + + pld [r5, r1] + + ; extract to s16 + vtrn.32 q14, q15 + vmovl.u8 q12, d28 + vmovl.u8 q13, d29 + + pld [r5, r1, lsl #1] + + ; src[] * filter_x + MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 + MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 + MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 + MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 + + pld [r5, -r8] + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + ; transpose + vtrn.16 d2, d3 + vtrn.32 d2, d3 + vtrn.8 d2, d3 + + vst1.u32 {d2[0]}, [r2@32], r3 + vst1.u32 {d3[0]}, [r2@32], r3 + vst1.u32 {d2[1]}, [r2@32], r3 + vst1.u32 {d3[1]}, [r2@32], r4 + + vmov q8, q9 + vmov d20, d23 + vmov q11, q12 + vmov q9, q13 + + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_loop_horiz + + ; outer loop + mov r6, r10 ; restore w counter + add r0, r0, r9 ; src += src_stride * 4 - w + add r2, r2, r12 ; dst += dst_stride * 4 - w + subs r7, r7, #4 ; h -= 4 + bgt aom_convolve8_loop_horiz_v + + pop {r4-r10, pc} + + ENDP + +|aom_convolve8_vert_neon| PROC + push {r4-r8, lr} + + ; adjust for taps + sub r0, r0, r1 + sub r0, r0, r1, lsl #1 + + ldr r4, [sp, #32] ; filter_y + ldr r6, [sp, #40] ; w + ldr lr, [sp, #44] ; h + + vld1.s16 {q0}, [r4] ; filter_y + + lsl r1, r1, #1 + lsl r3, r3, #1 + +aom_convolve8_loop_vert_h + mov r4, r0 + add r7, r0, r1, asr #1 + mov r5, r2 + add r8, r2, r3, asr #1 + mov r12, lr ; h loop counter + + vld1.u32 {d16[0]}, [r4], r1 + vld1.u32 {d16[1]}, [r7], r1 + vld1.u32 {d18[0]}, [r4], r1 + vld1.u32 {d18[1]}, [r7], r1 + vld1.u32 {d20[0]}, [r4], r1 + vld1.u32 {d20[1]}, [r7], r1 + vld1.u32 {d22[0]}, [r4], r1 + + vmovl.u8 q8, d16 + vmovl.u8 q9, d18 + vmovl.u8 q10, d20 + vmovl.u8 q11, d22 + +aom_convolve8_loop_vert + ; always process a 4x4 block at a time + vld1.u32 {d24[0]}, [r7], r1 + vld1.u32 {d26[0]}, [r4], r1 + vld1.u32 {d26[1]}, [r7], r1 + vld1.u32 {d24[1]}, [r4], r1 + + ; extract to s16 + vmovl.u8 q12, d24 + vmovl.u8 q13, d26 + + pld [r5] + pld [r8] + + ; src[] * filter_y + MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 + + pld [r5, r3] + pld [r8, r3] + + MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 + + pld [r7] + pld [r4] + + MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 + + pld [r7, r1] + pld [r4, r1] + + MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 + + ; += 64 >> 7 + vqrshrun.s32 d2, q1, #7 + vqrshrun.s32 d3, q2, #7 + vqrshrun.s32 d4, q14, #7 + vqrshrun.s32 d5, q15, #7 + + ; saturate + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 + + vst1.u32 {d2[0]}, [r5@32], r3 + vst1.u32 {d2[1]}, [r8@32], r3 + vst1.u32 {d3[0]}, [r5@32], r3 + vst1.u32 {d3[1]}, [r8@32], r3 + + vmov q8, q10 + vmov d18, d22 + vmov d19, d24 + vmov q10, q13 + vmov d22, d25 + + subs r12, r12, #4 ; h -= 4 + bgt aom_convolve8_loop_vert + + ; outer loop + add r0, r0, #4 + add r2, r2, #4 + subs r6, r6, #4 ; w -= 4 + bgt aom_convolve8_loop_vert_h + + pop {r4-r8, pc} + + ENDP + END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c new file mode 100644 index 000000000..f05d3ceae --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +void aom_convolve_avg_neon(const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { + uint8_t *d; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint32x2_t d0u32, d2u32; + uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + d = dst; + if (w > 32) { // avg64 + for (; h > 0; h -= 1) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); + src += src_stride; + q8u8 = vld1q_u8(d); + q9u8 = vld1q_u8(d + 16); + q10u8 = vld1q_u8(d + 32); + q11u8 = vld1q_u8(d + 48); + d += dst_stride; + + q0u8 = vrhaddq_u8(q0u8, q8u8); + q1u8 = vrhaddq_u8(q1u8, q9u8); + q2u8 = vrhaddq_u8(q2u8, q10u8); + q3u8 = vrhaddq_u8(q3u8, q11u8); + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + vst1q_u8(dst + 32, q2u8); + vst1q_u8(dst + 48, q3u8); + dst += dst_stride; + } + } else if (w == 32) { // avg32 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + src += src_stride; + q2u8 = vld1q_u8(src); + q3u8 = vld1q_u8(src + 16); + src += src_stride; + q8u8 = vld1q_u8(d); + q9u8 = vld1q_u8(d + 16); + d += dst_stride; + q10u8 = vld1q_u8(d); + q11u8 = vld1q_u8(d + 16); + d += dst_stride; + + q0u8 = vrhaddq_u8(q0u8, q8u8); + q1u8 = vrhaddq_u8(q1u8, q9u8); + q2u8 = vrhaddq_u8(q2u8, q10u8); + q3u8 = vrhaddq_u8(q3u8, q11u8); + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + dst += dst_stride; + vst1q_u8(dst, q2u8); + vst1q_u8(dst + 16, q3u8); + dst += dst_stride; + } + } else if (w > 8) { // avg16 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(src); + src += src_stride; + q2u8 = vld1q_u8(d); + d += dst_stride; + q3u8 = vld1q_u8(d); + d += dst_stride; + + q0u8 = vrhaddq_u8(q0u8, q2u8); + q1u8 = vrhaddq_u8(q1u8, q3u8); + + vst1q_u8(dst, q0u8); + dst += dst_stride; + vst1q_u8(dst, q1u8); + dst += dst_stride; + } + } else if (w == 8) { // avg8 + for (; h > 0; h -= 2) { + d0u8 = vld1_u8(src); + src += src_stride; + d1u8 = vld1_u8(src); + src += src_stride; + d2u8 = vld1_u8(d); + d += dst_stride; + d3u8 = vld1_u8(d); + d += dst_stride; + + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + q0u8 = vrhaddq_u8(q0u8, q1u8); + + vst1_u8(dst, vget_low_u8(q0u8)); + dst += dst_stride; + vst1_u8(dst, vget_high_u8(q0u8)); + dst += dst_stride; + } + } else { // avg4 + for (; h > 0; h -= 2) { + d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); + src += src_stride; + d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); + src += src_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); + d += dst_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); + d += dst_stride; + + d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32)); + + d0u32 = vreinterpret_u32_u8(d0u8); + vst1_lane_u32((uint32_t *)dst, d0u32, 0); + dst += dst_stride; + vst1_lane_u32((uint32_t *)dst, d0u32, 1); + dst += dst_stride; + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm new file mode 100644 index 000000000..43c300954 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm @@ -0,0 +1,119 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_convolve_avg_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|aom_convolve_avg_neon| PROC + push {r4-r6, lr} + ldrd r4, r5, [sp, #32] + mov r6, r2 + + cmp r4, #32 + bgt avg64 + beq avg32 + cmp r4, #8 + bgt avg16 + beq avg8 + b avg4 + +avg64 + sub lr, r1, #32 + sub r4, r3, #32 +avg64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + pld [r2, r3] + vld1.8 {q8-q9}, [r6@128]! + vld1.8 {q10-q11}, [r6@128], r4 + vrhadd.u8 q0, q0, q8 + vrhadd.u8 q1, q1, q9 + vrhadd.u8 q2, q2, q10 + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128]! + vst1.8 {q2-q3}, [r2@128], r4 + subs r5, r5, #1 + bgt avg64_h + pop {r4-r6, pc} + +avg32 + vld1.8 {q0-q1}, [r0], r1 + vld1.8 {q2-q3}, [r0], r1 + vld1.8 {q8-q9}, [r6@128], r3 + vld1.8 {q10-q11}, [r6@128], r3 + pld [r0] + vrhadd.u8 q0, q0, q8 + pld [r0, r1] + vrhadd.u8 q1, q1, q9 + pld [r6] + vrhadd.u8 q2, q2, q10 + pld [r6, r3] + vrhadd.u8 q3, q3, q11 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt avg32 + pop {r4-r6, pc} + +avg16 + vld1.8 {q0}, [r0], r1 + vld1.8 {q1}, [r0], r1 + vld1.8 {q2}, [r6@128], r3 + vld1.8 {q3}, [r6@128], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q2 + pld [r6] + pld [r6, r3] + vrhadd.u8 q1, q1, q3 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt avg16 + pop {r4-r6, pc} + +avg8 + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r0], r1 + vld1.8 {d2}, [r6@64], r3 + vld1.8 {d3}, [r6@64], r3 + pld [r0] + pld [r0, r1] + vrhadd.u8 q0, q0, q1 + pld [r6] + pld [r6, r3] + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d1}, [r2@64], r3 + subs r5, r5, #2 + bgt avg8 + pop {r4-r6, pc} + +avg4 + vld1.32 {d0[0]}, [r0], r1 + vld1.32 {d0[1]}, [r0], r1 + vld1.32 {d2[0]}, [r6@32], r3 + vld1.32 {d2[1]}, [r6@32], r3 + vrhadd.u8 d0, d0, d2 + vst1.32 {d0[0]}, [r2@32], r3 + vst1.32 {d0[1]}, [r2@32], r3 + subs r5, r5, #2 + bgt avg4 + pop {r4-r6, pc} + ENDP + + END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c new file mode 100644 index 000000000..9e57c7176 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +void aom_convolve_copy_neon(const uint8_t *src, // r0 + ptrdiff_t src_stride, // r1 + uint8_t *dst, // r2 + ptrdiff_t dst_stride, // r3 + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { + uint8x8_t d0u8, d2u8; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + + if (w > 32) { // copy64 + for (; h > 0; h--) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + q2u8 = vld1q_u8(src + 32); + q3u8 = vld1q_u8(src + 48); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + vst1q_u8(dst + 32, q2u8); + vst1q_u8(dst + 48, q3u8); + dst += dst_stride; + } + } else if (w == 32) { // copy32 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + q1u8 = vld1q_u8(src + 16); + src += src_stride; + q2u8 = vld1q_u8(src); + q3u8 = vld1q_u8(src + 16); + src += src_stride; + + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + dst += dst_stride; + vst1q_u8(dst, q2u8); + vst1q_u8(dst + 16, q3u8); + dst += dst_stride; + } + } else if (w > 8) { // copy16 + for (; h > 0; h -= 2) { + q0u8 = vld1q_u8(src); + src += src_stride; + q1u8 = vld1q_u8(src); + src += src_stride; + + vst1q_u8(dst, q0u8); + dst += dst_stride; + vst1q_u8(dst, q1u8); + dst += dst_stride; + } + } else if (w == 8) { // copy8 + for (; h > 0; h -= 2) { + d0u8 = vld1_u8(src); + src += src_stride; + d2u8 = vld1_u8(src); + src += src_stride; + + vst1_u8(dst, d0u8); + dst += dst_stride; + vst1_u8(dst, d2u8); + dst += dst_stride; + } + } else { // copy4 + for (; h > 0; h--) { + *(uint32_t *)dst = *(const uint32_t *)src; + src += src_stride; + dst += dst_stride; + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm new file mode 100644 index 000000000..443d7178a --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm @@ -0,0 +1,87 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_convolve_copy_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|aom_convolve_copy_neon| PROC + push {r4-r5, lr} + ldrd r4, r5, [sp, #28] + + cmp r4, #32 + bgt copy64 + beq copy32 + cmp r4, #8 + bgt copy16 + beq copy8 + b copy4 + +copy64 + sub lr, r1, #32 + sub r3, r3, #32 +copy64_h + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0]! + vld1.8 {q2-q3}, [r0], lr + vst1.8 {q0-q1}, [r2@128]! + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #1 + bgt copy64_h + pop {r4-r5, pc} + +copy32 + pld [r0, r1, lsl #1] + vld1.8 {q0-q1}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q2-q3}, [r0], r1 + vst1.8 {q0-q1}, [r2@128], r3 + vst1.8 {q2-q3}, [r2@128], r3 + subs r5, r5, #2 + bgt copy32 + pop {r4-r5, pc} + +copy16 + pld [r0, r1, lsl #1] + vld1.8 {q0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {q1}, [r0], r1 + vst1.8 {q0}, [r2@128], r3 + vst1.8 {q1}, [r2@128], r3 + subs r5, r5, #2 + bgt copy16 + pop {r4-r5, pc} + +copy8 + pld [r0, r1, lsl #1] + vld1.8 {d0}, [r0], r1 + pld [r0, r1, lsl #1] + vld1.8 {d2}, [r0], r1 + vst1.8 {d0}, [r2@64], r3 + vst1.8 {d2}, [r2@64], r3 + subs r5, r5, #2 + bgt copy8 + pop {r4-r5, pc} + +copy4 + ldr r12, [r0], r1 + str r12, [r2], r3 + subs r5, r5, #1 + bgt copy4 + pop {r4-r5, pc} + ENDP + + END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c new file mode 100644 index 000000000..6c2997e04 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). + */ + DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + + // Account for the vertical phase needing 3 lines prior and 4 lines post + int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. The neon implementation will ignore the + * given height and filter a multiple of 4 lines. Since this goes in to + * the temp buffer which has lots of extra room and is subsequently discarded + * this is safe if somewhat less than ideal. + */ + aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* This implementation has the same issues as above. In addition, we only want + * to average the values after both passes. + */ + aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, + intermediate_height); + aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c new file mode 100644 index 000000000..e730ccbcc --- /dev/null +++ b/third_party/aom/aom_dsp/arm/avg_neon.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" + +#include "aom/aom_integer.h" + +static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { + const uint32x4_t a = vpaddlq_u16(v_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) { + uint16x8_t v_sum; + uint32x2_t v_s0 = vdup_n_u32(0); + uint32x2_t v_s1 = vdup_n_u32(0); + v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); + v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); + v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); + v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); + v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); + return (horizontal_add_u16x8(v_sum) + 8) >> 4; +} + +unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) { + uint8x8_t v_s0 = vld1_u8(s); + const uint8x8_t v_s1 = vld1_u8(s + p); + uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); + + v_s0 = vld1_u8(s + 2 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 3 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 4 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 5 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 6 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + v_s0 = vld1_u8(s + 7 * p); + v_sum = vaddw_u8(v_sum, v_s0); + + return (horizontal_add_u16x8(v_sum) + 32) >> 6; +} + +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +int aom_satd_neon(const int16_t *coeff, int length) { + const int16x4_t zero = vdup_n_s16(0); + int32x4_t accum = vdupq_n_s32(0); + + do { + const int16x8_t src0 = vld1q_s16(coeff); + const int16x8_t src8 = vld1q_s16(coeff + 8); + accum = vabal_s16(accum, vget_low_s16(src0), zero); + accum = vabal_s16(accum, vget_high_s16(src0), zero); + accum = vabal_s16(accum, vget_low_s16(src8), zero); + accum = vabal_s16(accum, vget_high_s16(src8), zero); + length -= 16; + coeff += 16; + } while (length != 0); + + { + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. + const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), + vreinterpret_s32_s64(vget_high_s64(s0))); + const int satd = vget_lane_s32(s1, 0); + return satd; + } +} + +void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, int ref_stride, + int height) { + int i; + uint16x8_t vec_sum_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_hi = vdupq_n_u16(0); + const int shift_factor = ((height >> 5) + 3) * -1; + const int16x8_t vec_shift = vdupq_n_s16(shift_factor); + + for (i = 0; i < height; i += 8) { + const uint8x16_t vec_row1 = vld1q_u8(ref); + const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); + const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); + const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); + const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); + const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); + const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); + const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); + + vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); + vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); + + ref += ref_stride * 8; + } + + vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); + vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); + + vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); + hbuf += 8; + vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); +} + +int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) { + int i; + uint16x8_t vec_sum = vdupq_n_u16(0); + + for (i = 0; i < width; i += 16) { + const uint8x16_t vec_row = vld1q_u8(ref); + vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); + vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); + ref += 16; + } + + return horizontal_add_u16x8(vec_sum); +} + +// ref, src = [0, 510] - max diff = 16-bits +// bwl = {2, 3, 4}, width = {16, 32, 64} +int aom_vector_var_neon(int16_t const *ref, int16_t const *src, int bwl) { + int width = 4 << bwl; + int32x4_t sse = vdupq_n_s32(0); + int16x8_t total = vdupq_n_s16(0); + + assert(width >= 8); + assert((width % 8) == 0); + + do { + const int16x8_t r = vld1q_s16(ref); + const int16x8_t s = vld1q_s16(src); + const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. + const int16x4_t diff_lo = vget_low_s16(diff); + const int16x4_t diff_hi = vget_high_s16(diff); + sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. + sse = vmlal_s16(sse, diff_hi, diff_hi); + total = vaddq_s16(total, diff); // dynamic range 16 bits. + + ref += 8; + src += 8; + width -= 8; + } while (width != 0); + + { + // Note: 'total''s pairwise addition could be implemented similarly to + // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired + // with the summation of 'sse' performed better on a Cortex-A15. + const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' + const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); + const int32x2_t t2 = vpadd_s32(t1, t1); + const int t = vget_lane_s32(t2, 0); + const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. + const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), + vreinterpret_s32_s64(vget_high_s64(s0))); + const int s = vget_lane_s32(s1, 0); + const int shift_factor = bwl + 2; + return s - ((t * t) >> shift_factor); + } +} + +void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min, int *max) { + // Load and concatenate. + const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)); + const uint8x16_t a23 = + vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride)); + const uint8x16_t a45 = + vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride)); + const uint8x16_t a67 = + vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride)); + + const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)); + const uint8x16_t b23 = + vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride)); + const uint8x16_t b45 = + vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride)); + const uint8x16_t b67 = + vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride)); + + // Absolute difference. + const uint8x16_t ab01_diff = vabdq_u8(a01, b01); + const uint8x16_t ab23_diff = vabdq_u8(a23, b23); + const uint8x16_t ab45_diff = vabdq_u8(a45, b45); + const uint8x16_t ab67_diff = vabdq_u8(a67, b67); + + // Max values between the Q vectors. + const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff); + const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff); + const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff); + + const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); + const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); + + // Split to D and start doing pairwise. + uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); + uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); + + // Enough runs of vpmax/min propogate the max/min values to every position. + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + ab_max = vpmax_u8(ab_max, ab_max); + ab_min = vpmin_u8(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u8((uint8_t *)max, ab_max, 0); + vst1_lane_u8((uint8_t *)min, ab_min, 0); +} diff --git a/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm b/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm new file mode 100644 index 000000000..17b7d25f9 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm @@ -0,0 +1,240 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_filter_block2d_bil_first_pass_media| + EXPORT |aom_filter_block2d_bil_second_pass_media| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *dst_ptr, +; r2 unsigned int src_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *aom_filter +;------------------------------------- +; The output is transposed stroed in output array to make it easy for second pass filtering. +|aom_filter_block2d_bil_first_pass_media| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; aom_filter address + ldr r4, [sp, #36] ; width + + mov r12, r3 ; outer-loop counter + + add r7, r2, r4 ; preload next row + pld [r0, r7] + + sub r2, r2, r4 ; src increment for height loop + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) + + mov r11, r1 ; save dst_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parellel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + add r9, r2, r4, lsl #1 ; adding back block width + pld [r0, r9] ; preload next row + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + ldmia sp!, {r4 - r11, pc} + + ENDP ; |aom_filter_block2d_bil_first_pass_media| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *dst_ptr, +; r2 int dst_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *aom_filter +;--------------------------------- +|aom_filter_block2d_bil_second_pass_media| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; aom_filter address + ldr r4, [sp, #36] ; width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9, r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |aom_filter_block2d_second_pass_media| + + END diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c new file mode 100644 index 000000000..1cf8a3a6e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" + +void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = + vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from aom_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + +void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c new file mode 100644 index 000000000..9baefae47 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/hadamard_neon.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + const int16x8_t b0 = vaddq_s16(*a0, *a1); + const int16x8_t b1 = vsubq_s16(*a0, *a1); + const int16x8_t b2 = vaddq_s16(*a2, *a3); + const int16x8_t b3 = vsubq_s16(*a2, *a3); + const int16x8_t b4 = vaddq_s16(*a4, *a5); + const int16x8_t b5 = vsubq_s16(*a4, *a5); + const int16x8_t b6 = vaddq_s16(*a6, *a7); + const int16x8_t b7 = vsubq_s16(*a6, *a7); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + const int16x8_t c4 = vaddq_s16(b4, b6); + const int16x8_t c5 = vaddq_s16(b5, b7); + const int16x8_t c6 = vsubq_s16(b4, b6); + const int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a1 = vsubq_s16(c2, c6); + *a2 = vsubq_s16(c0, c4); + *a3 = vaddq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); + *a6 = vsubq_s16(c1, c5); + *a7 = vaddq_s16(c1, c5); +} + +// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider +// reversing transpose order which may make it easier for the compiler to +// reconcile the vtrn.64 moves. +static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + // Swap 64 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 08 09 10 11 12 13 14 15 + // a2: 16 17 18 19 20 21 22 23 + // a3: 24 25 26 27 28 29 30 31 + // a4: 32 33 34 35 36 37 38 39 + // a5: 40 41 42 43 44 45 46 47 + // a6: 48 49 50 51 52 53 54 55 + // a7: 56 57 58 59 60 61 62 63 + // to: + // a04_lo: 00 01 02 03 32 33 34 35 + // a15_lo: 08 09 10 11 40 41 42 43 + // a26_lo: 16 17 18 19 48 49 50 51 + // a37_lo: 24 25 26 27 56 57 58 59 + // a04_hi: 04 05 06 07 36 37 38 39 + // a15_hi: 12 13 14 15 44 45 46 47 + // a26_hi: 20 21 22 23 52 53 54 55 + // a37_hi: 28 29 30 31 60 61 62 63 + const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4)); + const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5)); + const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6)); + const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7)); + const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4)); + const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5)); + const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6)); + const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7)); + + // Swap 32 bit elements resulting in: + // a0246_lo: + // 00 01 16 17 32 33 48 49 + // 02 03 18 19 34 35 50 51 + // a1357_lo: + // 08 09 24 25 40 41 56 57 + // 10 11 26 27 42 43 58 59 + // a0246_hi: + // 04 05 20 21 36 37 52 53 + // 06 07 22 23 38 39 54 55 + // a1657_hi: + // 12 13 28 29 44 45 60 61 + // 14 15 30 31 46 47 62 63 + const int32x4x2_t a0246_lo = + vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); + const int32x4x2_t a1357_lo = + vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); + const int32x4x2_t a0246_hi = + vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); + const int32x4x2_t a1357_hi = + vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); + + // Swap 16 bit elements resulting in: + // b0: + // 00 08 16 24 32 40 48 56 + // 01 09 17 25 33 41 49 57 + // b1: + // 02 10 18 26 34 42 50 58 + // 03 11 19 27 35 43 51 59 + // b2: + // 04 12 20 28 36 44 52 60 + // 05 13 21 29 37 45 53 61 + // b3: + // 06 14 22 30 38 46 54 62 + // 07 15 23 31 39 47 55 63 + const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]), + vreinterpretq_s16_s32(a1357_lo.val[0])); + const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]), + vreinterpretq_s16_s32(a1357_lo.val[1])); + const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]), + vreinterpretq_s16_s32(a1357_hi.val[0])); + const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]), + vreinterpretq_s16_s32(a1357_hi.val[1])); + + *a0 = b0.val[0]; + *a1 = b0.val[1]; + *a2 = b1.val[0]; + *a3 = b1.val[1]; + *a4 = b2.val[0]; + *a5 = b2.val[1]; + *a6 = b3.val[0]; + *a7 = b3.val[1]; +} + +void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, + int16_t *coeff) { + int16x8_t a0 = vld1q_s16(src_diff); + int16x8_t a1 = vld1q_s16(src_diff + src_stride); + int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + // Skip the second transpose because it is not required. + + vst1q_s16(coeff + 0, a0); + vst1q_s16(coeff + 8, a1); + vst1q_s16(coeff + 16, a2); + vst1q_s16(coeff + 24, a3); + vst1q_s16(coeff + 32, a4); + vst1q_s16(coeff + 40, a5); + vst1q_s16(coeff + 48, a6); + vst1q_s16(coeff + 56, a7); +} + +void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, + int16_t *coeff) { + int i; + + /* Rearrange 16x16 to 8x32 and remove stride. + * Top left first. */ + aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); + /* Top right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); + /* Bottom left. */ + aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); + /* Bottom right. */ + aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); + + for (i = 0; i < 64; i += 8) { + const int16x8_t a0 = vld1q_s16(coeff + 0); + const int16x8_t a1 = vld1q_s16(coeff + 64); + const int16x8_t a2 = vld1q_s16(coeff + 128); + const int16x8_t a3 = vld1q_s16(coeff + 192); + + const int16x8_t b0 = vhaddq_s16(a0, a1); + const int16x8_t b1 = vhsubq_s16(a0, a1); + const int16x8_t b2 = vhaddq_s16(a2, a3); + const int16x8_t b3 = vhsubq_s16(a2, a3); + + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); + + vst1q_s16(coeff + 0, c0); + vst1q_s16(coeff + 64, c1); + vst1q_s16(coeff + 128, c2); + vst1q_s16(coeff + 192, c3); + + coeff += 8; + } +} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm new file mode 100644 index 000000000..d01c4bc03 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm @@ -0,0 +1,201 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + + EXPORT |aom_idct16x16_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct16x16_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asr r0, r0, #6 ; >> 6 + + vdup.s16 q0, r0 ; duplicate a1 + mov r0, #8 + sub r2, #8 + + ; load destination data row0 - row3 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row4 - row7 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row8 - row11 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row12 - row15 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |aom_idct16x16_1_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c new file mode 100644 index 000000000..196b2a890 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/inv_txfm.h" +#include "aom_ports/mem.h" + +void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, j, a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + for (d1 = d2 = dest, i = 0; i < 4; i++) { + for (j = 0; j < 2; j++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm new file mode 100644 index 000000000..4a8f8f183 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm @@ -0,0 +1,1182 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_idct16x16_256_add_neon_pass1| + EXPORT |aom_idct16x16_256_add_neon_pass2| + EXPORT |aom_idct16x16_10_add_neon_pass1| + EXPORT |aom_idct16x16_10_add_neon_pass2| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_256_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64 = 3196 + mov r3, #0xc00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r12, #0x3e00 + add r12, #0xc5 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r12 ; duplicate cospi_4_64 + + ; preloading to avoid stall + ; generate cospi_12_64 = 13623 + mov r3, #0x3500 + add r3, #0x37 + + ; generate cospi_20_64 = 9102 + mov r12, #0x2300 + add r12, #0x8e + + ; step2[4] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; step2[4] * cospi_4_64 + vmull.s16 q5, d18, d1 + vmull.s16 q6, d19, d1 + + ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 + vmlal.s16 q5, d30, d0 + vmlal.s16 q6, d31, d0 + + vdup.16 d2, r3 ; duplicate cospi_12_64 + vdup.16 d3, r12 ; duplicate cospi_20_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q5, #14 ; >> 14 + vqrshrn.s32 d15, q6, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; generate cospi_24_64 = 6270 + mov r12, #0x1800 + add r12, #0x7e + + ; step2[5] * cospi_12_64 + vmull.s16 q2, d26, d2 + vmull.s16 q3, d27, d2 + + ; step2[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q15, d27, d3 + + ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q2, d22, d3 + vmlsl.s16 q3, d23, d3 + + ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q15, d23, d2 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q2, #14 ; >> 14 + vqrshrn.s32 d11, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q15, #14 ; >> 14 + + ; stage 4 + vdup.16 d30, r3 ; cospi_16_64 + + ; step1[0] * cospi_16_64 + vmull.s16 q2, d16, d30 + vmull.s16 q11, d17, d30 + + ; step1[1] * cospi_16_64 + vmull.s16 q0, d24, d30 + vmull.s16 q1, d25, d30 + + ; generate cospi_8_64 = 15137 + mov r3, #0x3b00 + add r3, #0x21 + + vdup.16 d30, r12 ; duplicate cospi_24_64 + vdup.16 d31, r3 ; duplicate cospi_8_64 + + ; temp1 = (step1[0] + step1[1]) * cospi_16_64 + vadd.s32 q3, q2, q0 + vadd.s32 q12, q11, q1 + + ; temp2 = (step1[0] - step1[1]) * cospi_16_64 + vsub.s32 q13, q2, q0 + vsub.s32 q1, q11, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d16, q3, #14 ; >> 14 + vqrshrn.s32 d17, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d18, q13, #14 ; >> 14 + vqrshrn.s32 d19, q1, #14 ; >> 14 + + ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + ; step1[2] * cospi_8_64 + vmull.s16 q0, d20, d31 + vmull.s16 q1, d21, d31 + + ; step1[2] * cospi_24_64 + vmull.s16 q12, d20, d30 + vmull.s16 q13, d21, d30 + + ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q0, d28, d30 + vmlal.s16 q1, d29, d30 + + ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q12, d28, d31 + vmlsl.s16 q13, d29, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d22, q0, #14 ; >> 14 + vqrshrn.s32 d23, q1, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d20, q12, #14 ; >> 14 + vqrshrn.s32 d21, q13, #14 ; >> 14 + + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; + vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; stage 5 + vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; + vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; + vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; + vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; + + vdup.16 d16, r3; ; duplicate cospi_16_64 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q6, q9, q11 + vsub.s32 q13, q10, q12 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q6, #14 ; >> 14 + vqrshrn.s32 d11, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d16}, [r1], r2 + vst1.64 {d17}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |aom_idct16x16_256_add_neon_pass1| + +;void aom_idct16x16_256_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_256_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! + vmov.s16 q15, q0; + + ; generate cospi_30_64 = 1606 + mov r3, #0x0600 + add r3, #0x46 + + ; generate cospi_2_64 = 16305 + mov r12, #0x3f00 + add r12, #0xb1 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d12, r3 ; duplicate cospi_30_64 + vdup.16 d13, r12 ; duplicate cospi_2_64 + + ; preloading to avoid stall + ; generate cospi_14_64 = 12665 + mov r3, #0x3100 + add r3, #0x79 + + ; generate cospi_18_64 = 10394 + mov r12, #0x2800 + add r12, #0x9a + + ; step1[8] * cospi_30_64 + vmull.s16 q2, d16, d12 + vmull.s16 q3, d17, d12 + + ; step1[8] * cospi_2_64 + vmull.s16 q1, d16, d13 + vmull.s16 q4, d17, d13 + + ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 + vmlsl.s16 q2, d30, d13 + vmlsl.s16 q3, d31, d13 + + ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 + vmlal.s16 q1, d30, d12 + vmlal.s16 q4, d31, d12 + + vdup.16 d30, r3 ; duplicate cospi_14_64 + vdup.16 d31, r12 ; duplicate cospi_18_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d0, q2, #14 ; >> 14 + vqrshrn.s32 d1, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q1, #14 ; >> 14 + vqrshrn.s32 d15, q4, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_22_64 = 7723 + mov r3, #0x1e00 + add r3, #0x2b + + ; generate cospi_10_64 = 14449 + mov r12, #0x3800 + add r12, #0x71 + + ; step1[9] * cospi_14_64 + vmull.s16 q2, d24, d30 + vmull.s16 q3, d25, d30 + + ; step1[9] * cospi_18_64 + vmull.s16 q4, d24, d31 + vmull.s16 q5, d25, d31 + + ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 + vmlsl.s16 q2, d22, d31 + vmlsl.s16 q3, d23, d31 + + ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 + vmlal.s16 q4, d22, d30 + vmlal.s16 q5, d23, d30 + + vdup.16 d30, r3 ; duplicate cospi_22_64 + vdup.16 d31, r12 ; duplicate cospi_10_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q2, #14 ; >> 14 + vqrshrn.s32 d3, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q4, #14 ; >> 14 + vqrshrn.s32 d13, q5, #14 ; >> 14 + + ; step1[10] * cospi_22_64 + vmull.s16 q11, d20, d30 + vmull.s16 q12, d21, d30 + + ; step1[10] * cospi_10_64 + vmull.s16 q4, d20, d31 + vmull.s16 q5, d21, d31 + + ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 + vmlsl.s16 q11, d26, d31 + vmlsl.s16 q12, d27, d31 + + ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 + vmlal.s16 q4, d26, d30 + vmlal.s16 q5, d27, d30 + + ; preloading to avoid stall + ; generate cospi_6_64 = 15679 + mov r3, #0x3d00 + add r3, #0x3f + + ; generate cospi_26_64 = 4756 + mov r12, #0x1200 + add r12, #0x94 + + vdup.16 d30, r3 ; duplicate cospi_6_64 + vdup.16 d31, r12 ; duplicate cospi_26_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d11, q5, #14 ; >> 14 + vqrshrn.s32 d10, q4, #14 ; >> 14 + + ; step1[11] * cospi_6_64 + vmull.s16 q10, d28, d30 + vmull.s16 q11, d29, d30 + + ; step1[11] * cospi_26_64 + vmull.s16 q12, d28, d31 + vmull.s16 q13, d29, d31 + + ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 + vmlsl.s16 q10, d18, d31 + vmlsl.s16 q11, d19, d31 + + ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 + vmlal.s16 q12, d18, d30 + vmlal.s16 q13, d19, d30 + + vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] + vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q11, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q12, #14 ; >> 14 + vqrshrn.s32 d9, q13, #14 ; >> 14 + + ; stage 3 + vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] + vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] + vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] + vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] + vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] + vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + + ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vdup.16 d30, r12 ; duplicate cospi_8_64 + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d18, d31 + vmull.s16 q3, d19, d31 + + ; step1[14] * cospi_24_64 + vmull.s16 q4, d28, d31 + vmull.s16 q5, d29, d31 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d28, d30 + vmlal.s16 q3, d29, d30 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q4, d18, d30 + vmlsl.s16 q5, d19, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q4, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + vmov.s16 q3, q11 + vmov.s16 q4, q12 + + ; - step1[13] * cospi_8_64 + vmull.s16 q11, d26, d30 + vmull.s16 q12, d27, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d20, d30 + vmull.s16 q9, d21, d30 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlsl.s16 q11, d20, d31 + vmlsl.s16 q12, d21, d31 + + ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d26, d31 + vmlal.s16 q9, d27, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q10, q3, q0 + vadd.s32 q4, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q10, #14 ; >> 14 + vqrshrn.s32 d11, q4, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + cmp r3, #0 ; check if need adding dest data + beq skip_adding_dest + + ldr r7, [sp, #28] ; dest used to save element 0-7 + mov r9, r7 ; save dest pointer for later use + ldr r8, [sp, #32] ; load dest_stride + + ; stage 7 + ; load the data in pass1 + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + + ; store the data output 8,9,10,11,12,13,14,15 + vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q8 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q9, q9, #6 + vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q9 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q2, q2, #6 + vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q2 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q3, q3, #6 + vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q3 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q4, q4, #6 + vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q4 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q5, q5, #6 + vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q5 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q14, q14, #6 + vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q14 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q15, q15, #6 + vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q15 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + b end_idct16x16_pass2 + +skip_adding_dest + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |aom_idct16x16_256_add_neon_pass2| + +;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_10_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64*2 = 6392 + mov r3, #0x1800 + add r3, #0xf8 + + ; generate cospi_4_64*2 = 32138 + mov r12, #0x7d00 + add r12, #0x8a + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q0, r3 ; duplicate cospi_28_64*2 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, + ; double, and return the high 16 bits, effectively giving >> 15. Doubling + ; the constant will change this to >> 14. + ; dct_const_round_shift(step2[4] * cospi_28_64); + vqrdmulh.s16 q4, q9, q0 + + ; preloading to avoid stall + ; generate cospi_16_64*2 = 23170 + mov r3, #0x5a00 + add r3, #0x82 + + ; dct_const_round_shift(step2[4] * cospi_4_64); + vqrdmulh.s16 q7, q9, q1 + + ; stage 4 + vdup.16 q1, r3 ; cospi_16_64*2 + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + vdup.16 d4, r3; ; duplicate cospi_16_64 + + ; dct_const_round_shift(step1[0] * cospi_16_64) + vqrdmulh.s16 q8, q8, q1 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d14, d4 + vmull.s16 q10, d15, d4 + + ; step2[5] * cospi_16_64 + vmull.s16 q12, d9, d4 + vmull.s16 q11, d8, d4 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q15, q10, q12 + vsub.s32 q6, q9, q11 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d11, q15, #14 ; >> 14 + vqrshrn.s32 d10, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; + vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; + vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d4}, [r1], r2 + vst1.64 {d5}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |aom_idct16x16_10_add_neon_pass1| + +;void aom_idct16x16_10_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_10_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! + vmov.s16 q15, q0; + + ; generate 2*cospi_30_64 = 3212 + mov r3, #0xc00 + add r3, #0x8c + + ; generate 2*cospi_2_64 = 32610 + mov r12, #0x7f00 + add r12, #0x62 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q6, r3 ; duplicate 2*cospi_30_64 + + ; dct_const_round_shift(step1[8] * cospi_30_64) + vqrdmulh.s16 q0, q8, q6 + + vdup.16 q6, r12 ; duplicate 2*cospi_2_64 + + ; dct_const_round_shift(step1[8] * cospi_2_64) + vqrdmulh.s16 q7, q8, q6 + + ; preloading to avoid stall + ; generate 2*cospi_26_64 = 9512 + mov r12, #0x2500 + add r12, #0x28 + rsb r12, #0 + vdup.16 q15, r12 ; duplicate -2*cospi_26_64 + + ; generate 2*cospi_6_64 = 31358 + mov r3, #0x7a00 + add r3, #0x7e + vdup.16 q14, r3 ; duplicate 2*cospi_6_64 + + ; dct_const_round_shift(- step1[12] * cospi_26_64) + vqrdmulh.s16 q3, q9, q15 + + ; dct_const_round_shift(step1[12] * cospi_6_64) + vqrdmulh.s16 q4, q9, q14 + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + vdup.16 d30, r12 ; duplicate cospi_8_64 + + ; step1[14] * cospi_24_64 + vmull.s16 q12, d14, d31 + vmull.s16 q5, d15, d31 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d0, d31 + vmull.s16 q11, d1, d31 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q12, d0, d30 + vmlsl.s16 q5, d1, d30 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d14, d30 + vmlal.s16 q11, d15, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q12, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q11, #14 ; >> 14 + + ; - step1[13] * cospi_8_64 + vmull.s16 q10, d8, d30 + vmull.s16 q13, d9, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d6, d30 + vmull.s16 q9, d7, d30 + + ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 + vmlsl.s16 q10, d6, d31 + vmlsl.s16 q13, d7, d31 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d8, d31 + vmlal.s16 q9, d9, d31 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q10, #14 ; >> 14 + vqrshrn.s32 d5, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q0, q3, q0 + vadd.s32 q1, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q0, #14 ; >> 14 + vqrshrn.s32 d11, q1, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct10_16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |aom_idct16x16_10_add_neon_pass2| + END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c new file mode 100644 index 000000000..b4cb7a0cd --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c @@ -0,0 +1,1295 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" + +static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d0s16 = vdup_n_s16((int16_t)cospi_28_64); + d1s16 = vdup_n_s16((int16_t)cospi_4_64); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d18s16, d1s16); + q6s32 = vmull_s16(d19s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlal_s16(q5s32, d30s16, d0s16); + q6s32 = vmlal_s16(q6s32, d31s16, d0s16); + + d2s16 = vdup_n_s16((int16_t)cospi_12_64); + d3s16 = vdup_n_s16((int16_t)cospi_20_64); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q5s32, 14); + d15s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + q2s32 = vmull_s16(d26s16, d2s16); + q3s32 = vmull_s16(d27s16, d2s16); + q9s32 = vmull_s16(d26s16, d3s16); + q15s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q15s32 = vmlal_s16(q15s32, d23s16, d2s16); + + d10s16 = vqrshrn_n_s32(q2s32, 14); + d11s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q15s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + d30s16 = vdup_n_s16((int16_t)cospi_16_64); + + q2s32 = vmull_s16(d16s16, d30s16); + q11s32 = vmull_s16(d17s16, d30s16); + q0s32 = vmull_s16(d24s16, d30s16); + q1s32 = vmull_s16(d25s16, d30s16); + + d30s16 = vdup_n_s16((int16_t)cospi_24_64); + d31s16 = vdup_n_s16((int16_t)cospi_8_64); + + q3s32 = vaddq_s32(q2s32, q0s32); + q12s32 = vaddq_s32(q11s32, q1s32); + q13s32 = vsubq_s32(q2s32, q0s32); + q1s32 = vsubq_s32(q11s32, q1s32); + + d16s16 = vqrshrn_n_s32(q3s32, 14); + d17s16 = vqrshrn_n_s32(q12s32, 14); + d18s16 = vqrshrn_n_s32(q13s32, 14); + d19s16 = vqrshrn_n_s32(q1s32, 14); + q8s16 = vcombine_s16(d16s16, d17s16); + q9s16 = vcombine_s16(d18s16, d19s16); + + q0s32 = vmull_s16(d20s16, d31s16); + q1s32 = vmull_s16(d21s16, d31s16); + q12s32 = vmull_s16(d20s16, d30s16); + q13s32 = vmull_s16(d21s16, d30s16); + + q0s32 = vmlal_s16(q0s32, d28s16, d30s16); + q1s32 = vmlal_s16(q1s32, d29s16, d30s16); + q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); + + d22s16 = vqrshrn_n_s32(q0s32, 14); + d23s16 = vqrshrn_n_s32(q1s32, 14); + d20s16 = vqrshrn_n_s32(q12s32, 14); + d21s16 = vqrshrn_n_s32(q13s32, 14); + q10s16 = vcombine_s16(d20s16, d21s16); + q11s16 = vcombine_s16(d22s16, d23s16); + + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q15s16 = vaddq_s16(q6s16, q7s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + // stage 5 + q0s16 = vaddq_s16(q8s16, q11s16); + q1s16 = vaddq_s16(q9s16, q10s16); + q2s16 = vsubq_s16(q9s16, q10s16); + q3s16 = vsubq_s16(q8s16, q11s16); + + d16s16 = vdup_n_s16((int16_t)cospi_16_64); + + q11s32 = vmull_s16(d26s16, d16s16); + q12s32 = vmull_s16(d27s16, d16s16); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + + q6s32 = vsubq_s32(q9s32, q11s32); + q13s32 = vsubq_s32(q10s32, q12s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d10s16 = vqrshrn_n_s32(q6s32, 14); + d11s16 = vqrshrn_n_s32(q13s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q8s16 = vaddq_s16(q0s16, q15s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d16u64); + out += output_stride; + vst1_u64((uint64_t *)out, d17u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + uint8_t *d; + uint8x8_t d12u8, d13u8; + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + uint64x1_t d24u64, d25u64, d26u64, d27u64; + int64x1_t d12s64, d13s64; + uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; + uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + // stage 3 + d12s16 = vdup_n_s16((int16_t)cospi_30_64); + d13s16 = vdup_n_s16((int16_t)cospi_2_64); + + q2s32 = vmull_s16(d16s16, d12s16); + q3s32 = vmull_s16(d17s16, d12s16); + q1s32 = vmull_s16(d16s16, d13s16); + q4s32 = vmull_s16(d17s16, d13s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); + q1s32 = vmlal_s16(q1s32, d30s16, d12s16); + q4s32 = vmlal_s16(q4s32, d31s16, d12s16); + + d0s16 = vqrshrn_n_s32(q2s32, 14); + d1s16 = vqrshrn_n_s32(q3s32, 14); + d14s16 = vqrshrn_n_s32(q1s32, 14); + d15s16 = vqrshrn_n_s32(q4s32, 14); + q0s16 = vcombine_s16(d0s16, d1s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d30s16 = vdup_n_s16((int16_t)cospi_14_64); + d31s16 = vdup_n_s16((int16_t)cospi_18_64); + + q2s32 = vmull_s16(d24s16, d30s16); + q3s32 = vmull_s16(d25s16, d30s16); + q4s32 = vmull_s16(d24s16, d31s16); + q5s32 = vmull_s16(d25s16, d31s16); + + q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); + q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); + q4s32 = vmlal_s16(q4s32, d22s16, d30s16); + q5s32 = vmlal_s16(q5s32, d23s16, d30s16); + + d2s16 = vqrshrn_n_s32(q2s32, 14); + d3s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q4s32, 14); + d13s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16((int16_t)cospi_22_64); + d31s16 = vdup_n_s16((int16_t)cospi_10_64); + + q11s32 = vmull_s16(d20s16, d30s16); + q12s32 = vmull_s16(d21s16, d30s16); + q4s32 = vmull_s16(d20s16, d31s16); + q5s32 = vmull_s16(d21s16, d31s16); + + q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); + q4s32 = vmlal_s16(q4s32, d26s16, d30s16); + q5s32 = vmlal_s16(q5s32, d27s16, d30s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d11s16 = vqrshrn_n_s32(q5s32, 14); + d10s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + d30s16 = vdup_n_s16((int16_t)cospi_6_64); + d31s16 = vdup_n_s16((int16_t)cospi_26_64); + + q10s32 = vmull_s16(d28s16, d30s16); + q11s32 = vmull_s16(d29s16, d30s16); + q12s32 = vmull_s16(d28s16, d31s16); + q13s32 = vmull_s16(d29s16, d31s16); + + q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); + q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); + q12s32 = vmlal_s16(q12s32, d18s16, d30s16); + q13s32 = vmlal_s16(q13s32, d19s16, d30s16); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q11s32, 14); + d8s16 = vqrshrn_n_s32(q12s32, 14); + d9s16 = vqrshrn_n_s32(q13s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 3 + q9s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q10s16 = vsubq_s16(q3s16, q2s16); + q11s16 = vaddq_s16(q2s16, q3s16); + q12s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q6s16, q7s16); + + // stage 4 + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); + + q2s32 = vmull_s16(d18s16, d31s16); + q3s32 = vmull_s16(d19s16, d31s16); + q4s32 = vmull_s16(d28s16, d31s16); + q5s32 = vmull_s16(d29s16, d31s16); + + q2s32 = vmlal_s16(q2s32, d28s16, d30s16); + q3s32 = vmlal_s16(q3s32, d29s16, d30s16); + q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); + + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q3s32, 14); + d2s16 = vqrshrn_n_s32(q4s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + q3s16 = q11s16; + q4s16 = q12s16; + + d30s16 = vdup_n_s16(-cospi_8_64); + q11s32 = vmull_s16(d26s16, d30s16); + q12s32 = vmull_s16(d27s16, d30s16); + q8s32 = vmull_s16(d20s16, d30s16); + q9s32 = vmull_s16(d21s16, d30s16); + + q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); + q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); + q8s32 = vmlal_s16(q8s32, d26s16, d31s16); + q9s32 = vmlal_s16(q9s32, d27s16, d31s16); + + d4s16 = vqrshrn_n_s32(q11s32, 14); + d5s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16((int16_t)cospi_16_64); + + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q10s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q10s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + if (skip_adding != 0) { + d = dest; + // load the data in pass1 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + d13s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + q12s16 = vrshrq_n_s16(q12s16, 6); + q13s16 = vrshrq_n_s16(q13s16, 6); + q12u16 = + vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); + q13u16 = + vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); + d += dest_stride; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + // store the data out 8,9,10,11,12,13,14,15 + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q8s16 = vrshrq_n_s16(q8s16, 6); + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q9s16 = vrshrq_n_s16(q9s16, 6); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q2s16 = vrshrq_n_s16(q2s16, 6); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q3s16 = vrshrq_n_s16(q3s16, 6); + q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q4s16 = vrshrq_n_s16(q4s16, 6); + q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q5s16 = vrshrq_n_s16(q5s16, 6); + q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + dest += dest_stride; + q14s16 = vrshrq_n_s16(q14s16, 6); + q14u16 = + vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + d += dest_stride; + + d12s64 = vld1_s64((int64_t *)dest); + q15s16 = vrshrq_n_s16(q15s16, 6); + q15u16 = + vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); + d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); + vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); + } else { // skip_adding_dest + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); + out += 12; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); + out += 4; + vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + } + return; +} + +void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, + int output_stride) { + int16x4_t d4s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q6s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q15s32; + int16x8x2_t q0x2s16; + + q0x2s16 = vld2q_s16(in); + q8s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q9s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q10s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q11s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q12s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q13s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q14s16 = q0x2s16.val[0]; + in += 16; + q0x2s16 = vld2q_s16(in); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2)); + q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2)); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + // stage 4 + q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2)); + d4s16 = vdup_n_s16((int16_t)cospi_16_64); + + q8s16 = vqrdmulhq_s16(q8s16, q1s16); + + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + q9s32 = vmull_s16(d14s16, d4s16); + q10s32 = vmull_s16(d15s16, d4s16); + q12s32 = vmull_s16(d9s16, d4s16); + q11s32 = vmull_s16(d8s16, d4s16); + + q15s32 = vsubq_s32(q10s32, q12s32); + q6s32 = vsubq_s32(q9s32, q11s32); + q9s32 = vaddq_s32(q9s32, q11s32); + q10s32 = vaddq_s32(q10s32, q12s32); + + d11s16 = vqrshrn_n_s32(q15s32, 14); + d10s16 = vqrshrn_n_s32(q6s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q10s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 6 + q2s16 = vaddq_s16(q8s16, q7s16); + q9s16 = vaddq_s16(q8s16, q6s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q8s16, q4s16); + q12s16 = vsubq_s16(q8s16, q4s16); + q13s16 = vsubq_s16(q8s16, q5s16); + q14s16 = vsubq_s16(q8s16, q6s16); + q15s16 = vsubq_s16(q8s16, q7s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); + d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); + d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); + d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + // store the data + output_stride >>= 1; // output_stride / 2, out is int16_t + vst1_u64((uint64_t *)out, d4u64); + out += output_stride; + vst1_u64((uint64_t *)out, d5u64); + out += output_stride; + vst1_u64((uint64_t *)out, d18u64); + out += output_stride; + vst1_u64((uint64_t *)out, d19u64); + out += output_stride; + vst1_u64((uint64_t *)out, d20u64); + out += output_stride; + vst1_u64((uint64_t *)out, d21u64); + out += output_stride; + vst1_u64((uint64_t *)out, d22u64); + out += output_stride; + vst1_u64((uint64_t *)out, d23u64); + out += output_stride; + vst1_u64((uint64_t *)out, d24u64); + out += output_stride; + vst1_u64((uint64_t *)out, d25u64); + out += output_stride; + vst1_u64((uint64_t *)out, d26u64); + out += output_stride; + vst1_u64((uint64_t *)out, d27u64); + out += output_stride; + vst1_u64((uint64_t *)out, d28u64); + out += output_stride; + vst1_u64((uint64_t *)out, d29u64); + out += output_stride; + vst1_u64((uint64_t *)out, d30u64); + out += output_stride; + vst1_u64((uint64_t *)out, d31u64); + return; +} + +void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride) { + int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; + uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; + uint64x1_t d16u64, d17u64, d18u64, d19u64; + uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32; + int16x8x2_t q0x2s16; + (void)skip_adding; + (void)dest; + (void)dest_stride; + + q0x2s16 = vld2q_s16(src); + q8s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q9s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q10s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q11s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q12s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q13s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q14s16 = q0x2s16.val[0]; + src += 16; + q0x2s16 = vld2q_s16(src); + q15s16 = q0x2s16.val[0]; + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // stage 3 + q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2)); + q0s16 = vqrdmulhq_s16(q8s16, q6s16); + q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2)); + q7s16 = vqrdmulhq_s16(q8s16, q6s16); + + q15s16 = vdupq_n_s16(-cospi_26_64 * 2); + q14s16 = vdupq_n_s16((int16_t)(cospi_6_64 * 2)); + q3s16 = vqrdmulhq_s16(q9s16, q15s16); + q4s16 = vqrdmulhq_s16(q9s16, q14s16); + + // stage 4 + d0s16 = vget_low_s16(q0s16); + d1s16 = vget_high_s16(q0s16); + d6s16 = vget_low_s16(q3s16); + d7s16 = vget_high_s16(q3s16); + d8s16 = vget_low_s16(q4s16); + d9s16 = vget_high_s16(q4s16); + d14s16 = vget_low_s16(q7s16); + d15s16 = vget_high_s16(q7s16); + + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); + + q12s32 = vmull_s16(d14s16, d31s16); + q5s32 = vmull_s16(d15s16, d31s16); + q2s32 = vmull_s16(d0s16, d31s16); + q11s32 = vmull_s16(d1s16, d31s16); + + q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); + q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); + q2s32 = vmlal_s16(q2s32, d14s16, d30s16); + q11s32 = vmlal_s16(q11s32, d15s16, d30s16); + + d2s16 = vqrshrn_n_s32(q12s32, 14); + d3s16 = vqrshrn_n_s32(q5s32, 14); + d12s16 = vqrshrn_n_s32(q2s32, 14); + d13s16 = vqrshrn_n_s32(q11s32, 14); + q1s16 = vcombine_s16(d2s16, d3s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + d30s16 = vdup_n_s16(-cospi_8_64); + q10s32 = vmull_s16(d8s16, d30s16); + q13s32 = vmull_s16(d9s16, d30s16); + q8s32 = vmull_s16(d6s16, d30s16); + q9s32 = vmull_s16(d7s16, d30s16); + + q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); + q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); + q8s32 = vmlal_s16(q8s32, d8s16, d31s16); + q9s32 = vmlal_s16(q9s32, d9s16, d31s16); + + d4s16 = vqrshrn_n_s32(q10s32, 14); + d5s16 = vqrshrn_n_s32(q13s32, 14); + d10s16 = vqrshrn_n_s32(q8s32, 14); + d11s16 = vqrshrn_n_s32(q9s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + // stage 5 + q8s16 = vaddq_s16(q0s16, q3s16); + q9s16 = vaddq_s16(q1s16, q2s16); + q10s16 = vsubq_s16(q1s16, q2s16); + q11s16 = vsubq_s16(q0s16, q3s16); + q12s16 = vsubq_s16(q7s16, q4s16); + q13s16 = vsubq_s16(q6s16, q5s16); + q14s16 = vaddq_s16(q6s16, q5s16); + q15s16 = vaddq_s16(q7s16, q4s16); + + // stage 6 + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + + d14s16 = vdup_n_s16((int16_t)cospi_16_64); + q3s32 = vmull_s16(d26s16, d14s16); + q4s32 = vmull_s16(d27s16, d14s16); + q0s32 = vmull_s16(d20s16, d14s16); + q1s32 = vmull_s16(d21s16, d14s16); + + q5s32 = vsubq_s32(q3s32, q0s32); + q6s32 = vsubq_s32(q4s32, q1s32); + q0s32 = vaddq_s32(q3s32, q0s32); + q4s32 = vaddq_s32(q4s32, q1s32); + + d4s16 = vqrshrn_n_s32(q5s32, 14); + d5s16 = vqrshrn_n_s32(q6s32, 14); + d10s16 = vqrshrn_n_s32(q0s32, 14); + d11s16 = vqrshrn_n_s32(q4s32, 14); + q2s16 = vcombine_s16(d4s16, d5s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q0s32 = vmull_s16(d22s16, d14s16); + q1s32 = vmull_s16(d23s16, d14s16); + q13s32 = vmull_s16(d24s16, d14s16); + q6s32 = vmull_s16(d25s16, d14s16); + + q10s32 = vsubq_s32(q13s32, q0s32); + q4s32 = vsubq_s32(q6s32, q1s32); + q13s32 = vaddq_s32(q13s32, q0s32); + q6s32 = vaddq_s32(q6s32, q1s32); + + d6s16 = vqrshrn_n_s32(q10s32, 14); + d7s16 = vqrshrn_n_s32(q4s32, 14); + d8s16 = vqrshrn_n_s32(q13s32, 14); + d9s16 = vqrshrn_n_s32(q6s32, 14); + q3s16 = vcombine_s16(d6s16, d7s16); + q4s16 = vcombine_s16(d8s16, d9s16); + + // stage 7 + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q15s16); + q13s16 = vaddq_s16(q1s16, q14s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q14s16 = vsubq_s16(q1s16, q14s16); + q15s16 = vsubq_s16(q0s16, q15s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q10s16, q5s16); + q13s16 = vaddq_s16(q11s16, q4s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q4s16 = vsubq_s16(q11s16, q4s16); + q5s16 = vsubq_s16(q10s16, q5s16); + + q0s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q1s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q12s16 = vaddq_s16(q0s16, q3s16); + q13s16 = vaddq_s16(q1s16, q2s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q2s16 = vsubq_s16(q1s16, q2s16); + q3s16 = vsubq_s16(q0s16, q3s16); + + q10s16 = vld1q_s16(pass1Output); + pass1Output += 8; + q11s16 = vld1q_s16(pass1Output); + q12s16 = vaddq_s16(q10s16, q9s16); + q13s16 = vaddq_s16(q11s16, q8s16); + d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); + d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); + d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); + d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); + vst1_u64((uint64_t *)out, d24u64); + out += 4; + vst1_u64((uint64_t *)out, d25u64); + out += 12; + vst1_u64((uint64_t *)out, d26u64); + out += 4; + vst1_u64((uint64_t *)out, d27u64); + out += 12; + q8s16 = vsubq_s16(q11s16, q8s16); + q9s16 = vsubq_s16(q10s16, q9s16); + + d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); + d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); + d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); + d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); + d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); + d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); + d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); + d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); + d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); + d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); + d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); + d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); + d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); + d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); + d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); + d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); + + vst1_u64((uint64_t *)out, d16u64); + out += 4; + vst1_u64((uint64_t *)out, d17u64); + out += 12; + vst1_u64((uint64_t *)out, d18u64); + out += 4; + vst1_u64((uint64_t *)out, d19u64); + out += 12; + vst1_u64((uint64_t *)out, d4u64); + out += 4; + vst1_u64((uint64_t *)out, d5u64); + out += 12; + vst1_u64((uint64_t *)out, d6u64); + out += 4; + vst1_u64((uint64_t *)out, d7u64); + out += 12; + vst1_u64((uint64_t *)out, d8u64); + out += 4; + vst1_u64((uint64_t *)out, d9u64); + out += 12; + vst1_u64((uint64_t *)out, d10u64); + out += 4; + vst1_u64((uint64_t *)out, d11u64); + out += 12; + vst1_u64((uint64_t *)out, d28u64); + out += 4; + vst1_u64((uint64_t *)out, d29u64); + out += 12; + vst1_u64((uint64_t *)out, d30u64); + out += 4; + vst1_u64((uint64_t *)out, d31u64); + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_neon.c new file mode 100644 index 000000000..db0d4905b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_neon.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_dsp_common.h" + +void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, + int output_stride); +void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); +void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, + int output_stride); +void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, + int16_t *pass1Output, int16_t skip_adding, + uint8_t *dest, int dest_stride); + +#if HAVE_NEON_ASM +/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ +extern void aom_push_neon(int64_t *store); +extern void aom_pop_neon(int64_t *store); +#endif // HAVE_NEON_ASM + +void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; + +#if HAVE_NEON_ASM + // save d8-d15 register values. + aom_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); + + /* Parallel idct on the lower 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, + pass1_output, 0, dest, dest_stride); + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. + aom_pop_neon(store_reg); +#endif + + return; +} + +void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, + int dest_stride) { +#if HAVE_NEON_ASM + int64_t store_reg[8]; +#endif + int16_t pass1_output[16 * 16] = { 0 }; + int16_t row_idct_output[16 * 16] = { 0 }; + +#if HAVE_NEON_ASM + // save d8-d15 register values. + aom_push_neon(store_reg); +#endif + + /* Parallel idct on the upper 8 rows */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7 + // which will be saved into row_idct_output. + aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, + dest, dest_stride); + + /* Skip Parallel idct on the lower 8 rows as they are all 0s */ + + /* Parallel idct on the left 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, + pass1_output, 1, dest, dest_stride); + + /* Parallel idct on the right 8 columns */ + // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the + // stage 6 result in pass1_output. + aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + + // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines + // with result in pass1(pass1_output) to calculate final result in stage 7. + // Then add the result to the destination data. + aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, + row_idct_output + 8, pass1_output, 1, + dest + 8, dest_stride); + +#if HAVE_NEON_ASM + // restore d8-d15 register values. + aom_pop_neon(store_reg); +#endif + + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm new file mode 100644 index 000000000..b04df2d0b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm @@ -0,0 +1,147 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + EXPORT |aom_idct32x32_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ;TODO(hkuang): put the following macros in a seperate + ;file so other idct function could also use them. + MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10},[$dst], $stride + vst1.8 {q11},[$dst], $stride + vst1.8 {q12},[$dst], $stride + vst1.8 {q13},[$dst], $stride + vst1.8 {q14},[$dst], $stride + vst1.8 {q15},[$dst], $stride + MEND + +;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride + +|aom_idct32x32_1_add_neon| PROC + push {lr} + pld [r1] + add r3, r1, #16 ; r3 dest + 16 for second loop + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asrs r0, r0, #6 ; >> 6 + bge diff_positive_32_32 + +diff_negative_32_32 + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_negative_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_negative_32_32_loop + pop {pc} + +diff_positive_32_32 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_positive_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_positive_32_32_loop + pop {pc} + + ENDP ; |aom_idct32x32_1_add_neon| + END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c new file mode 100644 index 000000000..547567c5b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" + +#include "aom_dsp/inv_txfm.h" +#include "aom_ports/mem.h" + +static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vld1q_u8(d); + d += d_stride; + *q9u8 = vld1q_u8(d); + d += d_stride; + *q10u8 = vld1q_u8(d); + d += d_stride; + *q11u8 = vld1q_u8(d); + d += d_stride; + *q12u8 = vld1q_u8(d); + d += d_stride; + *q13u8 = vld1q_u8(d); + d += d_stride; + *q14u8 = vld1q_u8(d); + d += d_stride; + *q15u8 = vld1q_u8(d); + return; +} + +static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqaddq_u8(*q8u8, qdiffu8); + *q9u8 = vqaddq_u8(*q9u8, qdiffu8); + *q10u8 = vqaddq_u8(*q10u8, qdiffu8); + *q11u8 = vqaddq_u8(*q11u8, qdiffu8); + *q12u8 = vqaddq_u8(*q12u8, qdiffu8); + *q13u8 = vqaddq_u8(*q13u8, qdiffu8); + *q14u8 = vqaddq_u8(*q14u8, qdiffu8); + *q15u8 = vqaddq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + *q8u8 = vqsubq_u8(*q8u8, qdiffu8); + *q9u8 = vqsubq_u8(*q9u8, qdiffu8); + *q10u8 = vqsubq_u8(*q10u8, qdiffu8); + *q11u8 = vqsubq_u8(*q11u8, qdiffu8); + *q12u8 = vqsubq_u8(*q12u8, qdiffu8); + *q13u8 = vqsubq_u8(*q13u8, qdiffu8); + *q14u8 = vqsubq_u8(*q14u8, qdiffu8); + *q15u8 = vqsubq_u8(*q15u8, qdiffu8); + return; +} + +static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, + uint8x16_t *q9u8, uint8x16_t *q10u8, + uint8x16_t *q11u8, uint8x16_t *q12u8, + uint8x16_t *q13u8, uint8x16_t *q14u8, + uint8x16_t *q15u8) { + vst1q_u8(d, *q8u8); + d += d_stride; + vst1q_u8(d, *q9u8); + d += d_stride; + vst1q_u8(d, *q10u8); + d += d_stride; + vst1q_u8(d, *q11u8); + d += d_stride; + vst1q_u8(d, *q12u8); + d += d_stride; + vst1q_u8(d, *q13u8); + d += d_stride; + vst1q_u8(d, *q14u8); + d += d_stride; + vst1q_u8(d, *q15u8); + return; +} + +void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int i, j, dest_stride8; + uint8_t *d; + int16_t a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 6); + + dest_stride8 = dest_stride * 8; + if (a1 >= 0) { // diff_positive_32_32 + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } + } + } else { // diff_negative_32_32 + a1 = -a1; + a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; + q0u8 = vdupq_n_u8(a1); + for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop + d = dest; + for (j = 0; j < 4; j++) { + LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, + &q14u8, &q15u8); + d += dest_stride8; + } + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm new file mode 100644 index 000000000..e7793fb16 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm @@ -0,0 +1,1302 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +;TODO(cd): adjust these constant to be able to use vqdmulh for faster +; dct_const_round_shift(a * b) within butterfly calculations. +cospi_1_64 EQU 16364 +cospi_2_64 EQU 16305 +cospi_3_64 EQU 16207 +cospi_4_64 EQU 16069 +cospi_5_64 EQU 15893 +cospi_6_64 EQU 15679 +cospi_7_64 EQU 15426 +cospi_8_64 EQU 15137 +cospi_9_64 EQU 14811 +cospi_10_64 EQU 14449 +cospi_11_64 EQU 14053 +cospi_12_64 EQU 13623 +cospi_13_64 EQU 13160 +cospi_14_64 EQU 12665 +cospi_15_64 EQU 12140 +cospi_16_64 EQU 11585 +cospi_17_64 EQU 11003 +cospi_18_64 EQU 10394 +cospi_19_64 EQU 9760 +cospi_20_64 EQU 9102 +cospi_21_64 EQU 8423 +cospi_22_64 EQU 7723 +cospi_23_64 EQU 7005 +cospi_24_64 EQU 6270 +cospi_25_64 EQU 5520 +cospi_26_64 EQU 4756 +cospi_27_64 EQU 3981 +cospi_28_64 EQU 3196 +cospi_29_64 EQU 2404 +cospi_30_64 EQU 1606 +cospi_31_64 EQU 804 + + + EXPORT |aom_idct32x32_1024_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY + + ; -------------------------------------------------------------------------- + ; Load from transposed_buffer + ; q13 = transposed_buffer[first_offset] + ; q14 = transposed_buffer[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; transposed_buffer must be passed in. use 0 for first use. + MACRO + LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset + ; address calculation with proper stride and loading + add r0, #($first_offset - $prev_offset )*8*2 + vld1.s16 {q14}, [r0] + add r0, #($second_offset - $first_offset)*8*2 + vld1.s16 {q13}, [r0] + ; (used) two registers (q14, q13) + MEND + ; -------------------------------------------------------------------------- + ; Load from output (used as temporary storage) + ; reg1 = output[first_offset] + ; reg2 = output[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and loading + add r1, #($first_offset - $prev_offset )*32*2 + vld1.s16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vld1.s16 {$reg2}, [r1] + ; (used) two registers ($reg1, $reg2) + MEND + ; -------------------------------------------------------------------------- + ; Store into output (sometimes as as temporary storage) + ; output[first_offset] = reg1 + ; output[second_offset] = reg2 + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and storing + add r1, #($first_offset - $prev_offset )*32*2 + vst1.16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vst1.16 {$reg2}, [r1] + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! + vst1.16 {d4}, [r7]! + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + ; TODO(cd): have special case to re-use constants when they are similar for + ; consecutive butterflies + ; TODO(cd): have special case when both constants are the same, do the + ; additions/subtractions before the multiplies. + ; generate the constants + ; generate scalar constants + mov r8, #$first_constant & 0xFF00 + mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF + add r12, #$second_constant & 0x00FF + ; generate vector constants + vdup.16 d30, r8 + vdup.16 d31, r12 + ; (used) two for inputs (regA-regD), one for constants (q15) + ; do some multiplications (ordered for maximum latency hiding) + vmull.s16 q8, $regC, d30 + vmull.s16 q10, $regA, d31 + vmull.s16 q9, $regD, d30 + vmull.s16 q11, $regB, d31 + vmull.s16 q12, $regC, d31 + ; (used) five for intermediate (q8-q12), one for constants (q15) + ; do some addition/subtractions (to get back two register) + vsub.s32 q8, q8, q10 + vsub.s32 q9, q9, q11 + ; do more multiplications (ordered for maximum latency hiding) + vmull.s16 q10, $regD, d31 + vmull.s16 q11, $regA, d30 + vmull.s16 q15, $regB, d30 + ; (used) six for intermediate (q8-q12, q15) + ; do more addition/subtractions + vadd.s32 q11, q12, q11 + vadd.s32 q10, q10, q15 + ; (used) four for intermediate (q8-q11) + ; dct_const_round_shift + vqrshrn.s32 $reg1, q8, #14 + vqrshrn.s32 $reg2, q9, #14 + vqrshrn.s32 $reg3, q11, #14 + vqrshrn.s32 $reg4, q10, #14 + ; (used) two for results, well four d registers + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + MEND + ; -------------------------------------------------------------------------- + +;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +; +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) + +|aom_idct32x32_1024_add_neon| PROC + ; This function does one pass of idct32x32 transform. + ; + ; This is done by transposing the input and then doing a 1d transform on + ; columns. In the first pass, the transposed columns are the original + ; rows. In the second pass, after the transposition, the colums are the + ; original columns. + ; The 1d transform is done by looping over bands of eight columns (the + ; idct32_bands loop). For each band, the transform input transposition + ; is done on demand, one band of four 8x8 matrices at a time. The four + ; matrices are transposed by pairs (the idct32_transpose_pair loop). + push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter +idct32_bands_loop + mov r8, #2 ; initialize transpose loop counter +idct32_transpose_pair_loop + ; Load two horizontally consecutive 8x8 16bit data matrices. The first one + ; into q0-q7 and the second one into q8-q15. There is a stride of 64, + ; adjusted to 32 because of the two post-increments. + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! + + ; Transpose the two 8x8 16bit data matrices. + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vswp d1, d8 + vswp d7, d14 + vswp d5, d12 + vswp d3, d10 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + ; Store both matrices after each other. There is a stride of 32, which + ; adjusts to nothing because of the post-increments. + vst1.16 {q8}, [r0]! + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + vst1.16 {q0}, [r0]! + vst1.16 {q1}, [r0]! + vst1.16 {q2}, [r0]! + vst1.16 {q3}, [r0]! + vst1.16 {q4}, [r0]! + vst1.16 {q5}, [r0]! + vst1.16 {q6}, [r0]! + vst1.16 {q7}, [r0]! + + ; increment pointers by adjusted stride (not necessary for r0/out) + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eigth line only read + ; advance by 16*2 to go the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 + ; transpose pair loop processing + subs r8, r8, #1 + bne idct32_transpose_pair_loop + + ; restore r0/input to its original value + sub r0, r0, #32*8*2 + + ; Instead of doing the transforms stage by stage, it is done by loading + ; some input values and doing as many stages as possible to minimize the + ; storing/loading of intermediate results. To fit within registers, the + ; final coefficients are cut into four blocks: + ; BLOCK A: 16-19,28-31 + ; BLOCK B: 20-23,24-27 + ; BLOCK C: 8-10,11-15 + ; BLOCK D: 0-3,4-7 + ; Blocks A and C are straight calculation through the various stages. In + ; block B, further calculations are performed using the results from + ; block A. In block D, further calculations are performed using the results + ; from block C and then the final calculations are done using results from + ; block A and B which have been combined at the end of block B. + + ; -------------------------------------------------------------------------- + ; BLOCK A: 16-19,28-31 + ; -------------------------------------------------------------------------- + ; generate 16,17,30,31 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; + ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; + ;step1b[16][i] = dct_const_round_shift(temp1); + ;step1b[31][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 0, 1, 31 + DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; + ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; + ;step1b[17][i] = dct_const_round_shift(temp1); + ;step1b[30][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 31, 17, 15 + DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[16] = step1b[16][i] + step1b[17][i]; + ;step2[17] = step1b[16][i] - step1b[17][i]; + ;step2[30] = -step1b[30][i] + step1b[31][i]; + ;step2[31] = step1b[30][i] + step1b[31][i]; + vadd.s16 q4, q0, q1 + vsub.s16 q13, q0, q1 + vadd.s16 q6, q2, q3 + vsub.s16 q14, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; + ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; + ;step3[17] = dct_const_round_shift(temp1); + ;step3[30] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; generate 18,19,28,29 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; + ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; + ;step1b[18][i] = dct_const_round_shift(temp1); + ;step1b[29][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 15, 9, 23 + DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; + ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; + ;step1b[19][i] = dct_const_round_shift(temp1); + ;step1b[28][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 23, 25, 7 + DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[18] = -step1b[18][i] + step1b[19][i]; + ;step2[19] = step1b[18][i] + step1b[19][i]; + ;step2[28] = step1b[28][i] + step1b[29][i]; + ;step2[29] = step1b[28][i] - step1b[29][i]; + vsub.s16 q13, q3, q2 + vadd.s16 q3, q3, q2 + vsub.s16 q14, q1, q0 + vadd.s16 q2, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); + ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); + ;step3[29] = dct_const_round_shift(temp1); + ;step3[18] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 + ; -------------------------------------------------------------------------- + ; combine 16-19,28-31 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[16] = step1b[16][i] + step1b[19][i]; + ;step1[17] = step1b[17][i] + step1b[18][i]; + ;step1[18] = step1b[17][i] - step1b[18][i]; + ;step1[29] = step1b[30][i] - step1b[29][i]; + ;step1[30] = step1b[30][i] + step1b[29][i]; + ;step1[31] = step1b[31][i] + step1b[28][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q0 + vadd.s16 q10, q7, q1 + vadd.s16 q15, q6, q3 + vsub.s16 q13, q5, q0 + vsub.s16 q14, q7, q1 + STORE_IN_OUTPUT 0, 16, 31, q8, q15 + STORE_IN_OUTPUT 31, 17, 30, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; + ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; + ;step2[18] = dct_const_round_shift(temp1); + ;step2[29] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 + STORE_IN_OUTPUT 30, 29, 18, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[19] = step1b[16][i] - step1b[19][i]; + ;step1[28] = step1b[31][i] - step1b[28][i]; + vsub.s16 q13, q4, q2 + vsub.s16 q14, q6, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; + ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; + ;step2[19] = dct_const_round_shift(temp1); + ;step2[28] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 + STORE_IN_OUTPUT 18, 19, 28, q4, q6 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK B: 20-23,24-27 + ; -------------------------------------------------------------------------- + ; generate 20,21,26,27 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; + ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; + ;step1b[20][i] = dct_const_round_shift(temp1); + ;step1b[27][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 7, 5, 27 + DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; + ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; + ;step1b[21][i] = dct_const_round_shift(temp1); + ;step1b[26][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 27, 21, 11 + DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[20] = step1b[20][i] + step1b[21][i]; + ;step2[21] = step1b[20][i] - step1b[21][i]; + ;step2[26] = -step1b[26][i] + step1b[27][i]; + ;step2[27] = step1b[26][i] + step1b[27][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; + ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; + ;step3[21] = dct_const_round_shift(temp1); + ;step3[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 22,23,24,25 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; + ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; + ;step1b[22][i] = dct_const_round_shift(temp1); + ;step1b[25][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 11, 13, 19 + DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; + ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; + ;step1b[23][i] = dct_const_round_shift(temp1); + ;step1b[24][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 19, 29, 3 + DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[22] = -step1b[22][i] + step1b[23][i]; + ;step2[23] = step1b[22][i] + step1b[23][i]; + ;step2[24] = step1b[24][i] + step1b[25][i]; + ;step2[25] = step1b[24][i] - step1b[25][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); + ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); + ;step3[25] = dct_const_round_shift(temp1); + ;step3[22] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 20-23,24-27 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[22] = step1b[22][i] + step1b[21][i]; + ;step1[23] = step1b[23][i] + step1b[20][i]; + vadd.s16 q10, q7, q1 + vadd.s16 q11, q5, q0 + ;step1[24] = step1b[24][i] + step1b[27][i]; + ;step1[25] = step1b[25][i] + step1b[26][i]; + vadd.s16 q12, q6, q2 + vadd.s16 q15, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[16] = step1b[16][i] + step1b[23][i]; + ;step3[17] = step1b[17][i] + step1b[22][i]; + ;step3[22] = step1b[17][i] - step1b[22][i]; + ;step3[23] = step1b[16][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 + vadd.s16 q8, q14, q11 + vadd.s16 q9, q13, q10 + vsub.s16 q13, q13, q10 + vsub.s16 q11, q14, q11 + STORE_IN_OUTPUT 17, 17, 16, q9, q8 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[24] = step1b[31][i] - step1b[24][i]; + ;step3[25] = step1b[30][i] - step1b[25][i]; + ;step3[30] = step1b[30][i] + step1b[25][i]; + ;step3[31] = step1b[31][i] + step1b[24][i]; + LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 + vsub.s16 q8, q9, q12 + vadd.s16 q10, q14, q15 + vsub.s16 q14, q14, q15 + vadd.s16 q12, q9, q12 + STORE_IN_OUTPUT 31, 30, 31, q10, q12 + ; -------------------------------------------------------------------------- + ; TODO(cd) do some register allocation change to remove these push/pop + vpush {q8} ; [24] + vpush {q11} ; [23] + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; + ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; + ;step1[22] = dct_const_round_shift(temp1); + ;step1[25] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 31, 25, 22, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; + ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; + ;step1[23] = dct_const_round_shift(temp1); + ;step1[24] = dct_const_round_shift(temp2); + ; TODO(cd) do some register allocation change to remove these push/pop + vpop {q13} ; [23] + vpop {q14} ; [24] + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 22, 24, 23, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[20] = step1b[23][i] - step1b[20][i]; + ;step1[27] = step1b[24][i] - step1b[27][i]; + vsub.s16 q14, q5, q0 + vsub.s16 q13, q6, q2 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); + ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); + ;step2[27] = dct_const_round_shift(temp1); + ;step2[20] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[21] = step1b[22][i] - step1b[21][i]; + ;step1[26] = step1b[25][i] - step1b[26][i]; + vsub.s16 q14, q7, q1 + vsub.s16 q13, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); + ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); + ;step2[26] = dct_const_round_shift(temp1); + ;step2[21] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[18] = step1b[18][i] + step1b[21][i]; + ;step3[19] = step1b[19][i] + step1b[20][i]; + ;step3[20] = step1b[19][i] - step1b[20][i]; + ;step3[21] = step1b[18][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 + vadd.s16 q8, q14, q1 + vadd.s16 q9, q13, q6 + vsub.s16 q13, q13, q6 + vsub.s16 q1, q14, q1 + STORE_IN_OUTPUT 19, 18, 19, q8, q9 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[27] = step1b[28][i] - step1b[27][i]; + ;step3[28] = step1b[28][i] + step1b[27][i]; + ;step3[29] = step1b[29][i] + step1b[26][i]; + ;step3[26] = step1b[29][i] - step1b[26][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 + vsub.s16 q14, q8, q5 + vadd.s16 q10, q8, q5 + vadd.s16 q11, q9, q0 + vsub.s16 q0, q9, q0 + STORE_IN_OUTPUT 29, 28, 29, q10, q11 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; + ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; + ;step1[20] = dct_const_round_shift(temp1); + ;step1[27] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 29, 20, 27, q13, q14 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; + ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; + ;step1[21] = dct_const_round_shift(temp1); + ;step1[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 + STORE_IN_OUTPUT 27, 21, 26, q1, q0 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK C: 8-10,11-15 + ; -------------------------------------------------------------------------- + ; generate 8,9,14,15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; + ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; + ;step2[8] = dct_const_round_shift(temp1); + ;step2[15] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 3, 2, 30 + DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; + ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; + ;step2[9] = dct_const_round_shift(temp1); + ;step2[14] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 30, 18, 14 + DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[8] = step1b[8][i] + step1b[9][i]; + ;step3[9] = step1b[8][i] - step1b[9][i]; + ;step3[14] = step1b[15][i] - step1b[14][i]; + ;step3[15] = step1b[15][i] + step1b[14][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; + ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; + ;step1[9] = dct_const_round_shift(temp1); + ;step1[14] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 10,11,12,13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; + ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; + ;step2[10] = dct_const_round_shift(temp1); + ;step2[13] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 14, 10, 22 + DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; + ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; + ;step2[11] = dct_const_round_shift(temp1); + ;step2[12] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 22, 26, 6 + DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[10] = step1b[11][i] - step1b[10][i]; + ;step3[11] = step1b[11][i] + step1b[10][i]; + ;step3[12] = step1b[12][i] + step1b[13][i]; + ;step3[13] = step1b[12][i] - step1b[13][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); + ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); + ;step1[13] = dct_const_round_shift(temp1); + ;step1[10] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 8-10,11-15 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[8] = step1b[8][i] + step1b[11][i]; + ;step2[9] = step1b[9][i] + step1b[10][i]; + ;step2[10] = step1b[9][i] - step1b[10][i]; + vadd.s16 q8, q0, q5 + vadd.s16 q9, q1, q7 + vsub.s16 q13, q1, q7 + ;step2[13] = step1b[14][i] - step1b[13][i]; + ;step2[14] = step1b[14][i] + step1b[13][i]; + ;step2[15] = step1b[15][i] + step1b[12][i]; + vsub.s16 q14, q3, q4 + vadd.s16 q10, q3, q4 + vadd.s16 q15, q2, q6 + STORE_IN_OUTPUT 26, 8, 15, q8, q15 + STORE_IN_OUTPUT 15, 9, 14, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; + ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; + ;step3[10] = dct_const_round_shift(temp1); + ;step3[13] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 14, 13, 10, q3, q1 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[11] = step1b[8][i] - step1b[11][i]; + ;step2[12] = step1b[15][i] - step1b[12][i]; + vsub.s16 q13, q0, q5 + vsub.s16 q14, q2, q6 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; + ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; + ;step3[11] = dct_const_round_shift(temp1); + ;step3[12] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 10, 11, 12, q1, q3 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK D: 0-3,4-7 + ; -------------------------------------------------------------------------- + ; generate 4,5,6,7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; + ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; + ;step3[4] = dct_const_round_shift(temp1); + ;step3[7] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 6, 4, 28 + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; + ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; + ;step3[5] = dct_const_round_shift(temp1); + ;step3[6] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 28, 20, 12 + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[4] = step1b[4][i] + step1b[5][i]; + ;step1[5] = step1b[4][i] - step1b[5][i]; + ;step1[6] = step1b[7][i] - step1b[6][i]; + ;step1[7] = step1b[7][i] + step1b[6][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; + ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; + ;step2[5] = dct_const_round_shift(temp1); + ;step2[6] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 0,1,2,3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; + ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; + ;step1[1] = dct_const_round_shift(temp1); + ;step1[0] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 12, 0, 16 + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; + ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; + ;step1[2] = dct_const_round_shift(temp1); + ;step1[3] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 16, 8, 24 + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[0] = step1b[0][i] + step1b[3][i]; + ;step2[1] = step1b[1][i] + step1b[2][i]; + ;step2[2] = step1b[1][i] - step1b[2][i]; + ;step2[3] = step1b[0][i] - step1b[3][i]; + vadd.s16 q4, q7, q6 + vsub.s16 q7, q7, q6 + vsub.s16 q6, q5, q14 + vadd.s16 q5, q5, q14 + ; -------------------------------------------------------------------------- + ; combine 0-3,4-7 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[0] = step1b[0][i] + step1b[7][i]; + ;step3[1] = step1b[1][i] + step1b[6][i]; + ;step3[2] = step1b[2][i] + step1b[5][i]; + ;step3[3] = step1b[3][i] + step1b[4][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q3 + vadd.s16 q10, q6, q1 + vadd.s16 q11, q7, q0 + ;step3[4] = step1b[3][i] - step1b[4][i]; + ;step3[5] = step1b[2][i] - step1b[5][i]; + ;step3[6] = step1b[1][i] - step1b[6][i]; + ;step3[7] = step1b[0][i] - step1b[7][i]; + vsub.s16 q12, q7, q0 + vsub.s16 q13, q6, q1 + vsub.s16 q14, q5, q3 + vsub.s16 q15, q4, q2 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[0] = step1b[0][i] + step1b[15][i]; + ;step1[1] = step1b[1][i] + step1b[14][i]; + ;step1[14] = step1b[1][i] - step1b[14][i]; + ;step1[15] = step1b[0][i] - step1b[15][i]; + LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 + vadd.s16 q2, q8, q1 + vadd.s16 q3, q9, q0 + vsub.s16 q4, q9, q0 + vsub.s16 q5, q8, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[14 * 32] = step1b[14][i] + step1b[17][i]; + ;output[15 * 32] = step1b[15][i] + step1b[16][i]; + ;output[16 * 32] = step1b[15][i] - step1b[16][i]; + ;output[17 * 32] = step1b[14][i] - step1b[17][i]; + LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; parameters for second pass + ; the input of pass2 is the result of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 + + ; pass loop processing + add r5, r5, #1 + b idct32_pass_loop + +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. + add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |aom_idct32x32_1024_add_neon| + END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c new file mode 100644 index 000000000..a7562c7d5 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" + +#define LOAD_FROM_TRANSPOSED(prev, first, second) \ + q14s16 = vld1q_s16(trans_buf + first * 8); \ + q13s16 = vld1q_s16(trans_buf + second * 8); + +#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ + qA = vld1q_s16(out + first * 32); \ + qB = vld1q_s16(out + second * 32); + +#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ + vst1q_s16(out + first * 32, qA); \ + vst1q_s16(out + second * 32, qB); + +#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ + __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); +static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, + int stride, int16x8_t q6s16, + int16x8_t q7s16, + int16x8_t q8s16, + int16x8_t q9s16) { + int16x4_t d8s16, d9s16, d10s16, d11s16; + + d8s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d11s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d9s16 = vld1_s16((int16_t *)p1); + d10s16 = vld1_s16((int16_t *)p2); + + q7s16 = vrshrq_n_s16(q7s16, 6); + q8s16 = vrshrq_n_s16(q8s16, 6); + q9s16 = vrshrq_n_s16(q9s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + + q7s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); + q8s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); + q9s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); + q6s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); + + d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); + d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); + d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + + vst1_s16((int16_t *)p1, d9s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d10s16); + p2 += stride; + vst1_s16((int16_t *)p1, d8s16); + vst1_s16((int16_t *)p2, d11s16); + return; +} + +#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ + ; \ + __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); +static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, + int stride, int16x8_t q4s16, + int16x8_t q5s16, + int16x8_t q6s16, + int16x8_t q7s16) { + int16x4_t d4s16, d5s16, d6s16, d7s16; + + d4s16 = vld1_s16((int16_t *)p1); + p1 += stride; + d7s16 = vld1_s16((int16_t *)p2); + p2 -= stride; + d5s16 = vld1_s16((int16_t *)p1); + d6s16 = vld1_s16((int16_t *)p2); + + q5s16 = vrshrq_n_s16(q5s16, 6); + q6s16 = vrshrq_n_s16(q6s16, 6); + q7s16 = vrshrq_n_s16(q7s16, 6); + q4s16 = vrshrq_n_s16(q4s16, 6); + + q5s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); + q6s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); + q7s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); + q4s16 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); + + d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); + d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); + d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); + d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); + + vst1_s16((int16_t *)p1, d5s16); + p1 -= stride; + vst1_s16((int16_t *)p2, d6s16); + p2 += stride; + vst1_s16((int16_t *)p2, d7s16); + vst1_s16((int16_t *)p1, d4s16); + return; +} + +#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ + DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); +static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, + int16_t first_const, int16_t second_const, + int16x8_t *qAs16, int16x8_t *qBs16) { + int16x4_t d30s16, d31s16; + int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; + int16x4_t dCs16, dDs16, dAs16, dBs16; + + dCs16 = vget_low_s16(q14s16); + dDs16 = vget_high_s16(q14s16); + dAs16 = vget_low_s16(q13s16); + dBs16 = vget_high_s16(q13s16); + + d30s16 = vdup_n_s16(first_const); + d31s16 = vdup_n_s16(second_const); + + q8s32 = vmull_s16(dCs16, d30s16); + q10s32 = vmull_s16(dAs16, d31s16); + q9s32 = vmull_s16(dDs16, d30s16); + q11s32 = vmull_s16(dBs16, d31s16); + q12s32 = vmull_s16(dCs16, d31s16); + + q8s32 = vsubq_s32(q8s32, q10s32); + q9s32 = vsubq_s32(q9s32, q11s32); + + q10s32 = vmull_s16(dDs16, d31s16); + q11s32 = vmull_s16(dAs16, d30s16); + q15s32 = vmull_s16(dBs16, d30s16); + + q11s32 = vaddq_s32(q12s32, q11s32); + q10s32 = vaddq_s32(q10s32, q15s32); + + *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); + return; +} + +static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { + int16_t *in; + int i; + const int stride = 32; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + for (i = 0; i < 4; i++, input += 8) { + in = input; + q8s16 = vld1q_s16(in); + in += stride; + q9s16 = vld1q_s16(in); + in += stride; + q10s16 = vld1q_s16(in); + in += stride; + q11s16 = vld1q_s16(in); + in += stride; + q12s16 = vld1q_s16(in); + in += stride; + q13s16 = vld1q_s16(in); + in += stride; + q14s16 = vld1q_s16(in); + in += stride; + q15s16 = vld1q_s16(in); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + d20s16 = vget_low_s16(q10s16); + d21s16 = vget_high_s16(q10s16); + d22s16 = vget_low_s16(q11s16); + d23s16 = vget_high_s16(q11s16); + d24s16 = vget_low_s16(q12s16); + d25s16 = vget_high_s16(q12s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + d30s16 = vget_low_s16(q15s16); + d31s16 = vget_high_s16(q15s16); + + q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + q12s16 = vcombine_s16(d17s16, d25s16); + q13s16 = vcombine_s16(d19s16, d27s16); + q14s16 = vcombine_s16(d21s16, d29s16); + q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + vst1q_s16(t_buf, q0x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q0x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q1x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q2x2s16.val[1]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[0]); + t_buf += 8; + vst1q_s16(t_buf, q3x2s16.val[1]); + t_buf += 8; + } + return; +} + +static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, + int16x8_t q3s16, int16x8_t q6s16, + int16x8_t q7s16, int16x8_t q8s16, + int16x8_t q9s16, int16x8_t q10s16, + int16x8_t q11s16, int16x8_t q12s16, + int16x8_t q13s16, int16x8_t q14s16, + int16x8_t q15s16) { + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); + STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); + + LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); + STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); + + LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); + STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); + + LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); + STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); + + LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); + STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); + + LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); + STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); + + LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); + STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); + + LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); + STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); + return; +} + +static INLINE void idct32_bands_end_2nd_pass( + int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, + int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, + int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, + int16x8_t q14s16, int16x8_t q15s16) { + uint8_t *r6 = dest + 31 * stride; + uint8_t *r7 = dest /* + 0 * stride*/; + uint8_t *r9 = dest + 15 * stride; + uint8_t *r10 = dest + 16 * stride; + int str2 = stride << 1; + int16x8_t q0s16, q1s16, q4s16, q5s16; + + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) + q2s16 = vaddq_s16(q10s16, q1s16); + q3s16 = vaddq_s16(q11s16, q0s16); + q4s16 = vsubq_s16(q11s16, q0s16); + q5s16 = vsubq_s16(q10s16, q1s16); + + LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) + q2s16 = vaddq_s16(q12s16, q1s16); + q3s16 = vaddq_s16(q13s16, q0s16); + q4s16 = vsubq_s16(q13s16, q0s16); + q5s16 = vsubq_s16(q12s16, q1s16); + + LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + r10 += str2; + r9 -= str2; + + LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + r7 += str2; + r6 -= str2; + + LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) + q2s16 = vaddq_s16(q14s16, q1s16); + q3s16 = vaddq_s16(q15s16, q0s16); + q4s16 = vsubq_s16(q15s16, q0s16); + q5s16 = vsubq_s16(q14s16, q1s16); + + LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + STORE_COMBINE_CENTER_RESULTS(r10, r9); + + LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) + q4s16 = vaddq_s16(q2s16, q1s16); + q5s16 = vaddq_s16(q3s16, q0s16); + q6s16 = vsubq_s16(q3s16, q0s16); + q7s16 = vsubq_s16(q2s16, q1s16); + STORE_COMBINE_EXTREME_RESULTS(r7, r6); + return; +} + +void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { + int i, idct32_pass_loop; + int16_t trans_buf[32 * 8]; + int16_t pass1[32 * 32]; + int16_t pass2[32 * 32]; + int16_t *out; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + + for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; + idct32_pass_loop++, + input = pass1, // the input of pass2 is the result of pass1 + out = pass2) { + for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop + idct32_transpose_pair(input, trans_buf); + + // ----------------------------------------- + // BLOCK A: 16-19,28-31 + // ----------------------------------------- + // generate 16,17,30,31 + // part of stage 1 + LOAD_FROM_TRANSPOSED(0, 1, 31) + DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(31, 17, 15) + DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) + // part of stage 2 + q4s16 = vaddq_s16(q0s16, q1s16); + q13s16 = vsubq_s16(q0s16, q1s16); + q6s16 = vaddq_s16(q2s16, q3s16); + q14s16 = vsubq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) + + // generate 18,19,28,29 + // part of stage 1 + LOAD_FROM_TRANSPOSED(15, 9, 23) + DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(23, 25, 7) + DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q3s16, q2s16); + q3s16 = vaddq_s16(q3s16, q2s16); + q14s16 = vsubq_s16(q1s16, q0s16); + q2s16 = vaddq_s16(q1s16, q0s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) + // part of stage 4 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q10s16 = vaddq_s16(q7s16, q1s16); + q15s16 = vaddq_s16(q6s16, q3s16); + q13s16 = vsubq_s16(q5s16, q0s16); + q14s16 = vsubq_s16(q7s16, q1s16); + STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) + STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) + STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) + // part of stage 4 + q13s16 = vsubq_s16(q4s16, q2s16); + q14s16 = vsubq_s16(q6s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) + STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) + + // ----------------------------------------- + // BLOCK B: 20-23,24-27 + // ----------------------------------------- + // generate 20,21,26,27 + // part of stage 1 + LOAD_FROM_TRANSPOSED(7, 5, 27) + DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(27, 21, 11) + DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) + // part of stage 2 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 3 + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + + // generate 22,23,24,25 + // part of stage 1 + LOAD_FROM_TRANSPOSED(11, 13, 19) + DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(19, 29, 3) + DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) + // part of stage 2 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 3 + DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) + // part of stage 4 + q10s16 = vaddq_s16(q7s16, q1s16); + q11s16 = vaddq_s16(q5s16, q0s16); + q12s16 = vaddq_s16(q6s16, q2s16); + q15s16 = vaddq_s16(q4s16, q3s16); + // part of stage 6 + LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q11s16); + q9s16 = vaddq_s16(q13s16, q10s16); + q13s16 = vsubq_s16(q13s16, q10s16); + q11s16 = vsubq_s16(q14s16, q11s16); + STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) + LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) + q8s16 = vsubq_s16(q9s16, q12s16); + q10s16 = vaddq_s16(q14s16, q15s16); + q14s16 = vsubq_s16(q14s16, q15s16); + q12s16 = vaddq_s16(q9s16, q12s16); + STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) + q13s16 = q11s16; + q14s16 = q8s16; + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) + // part of stage 4 + q14s16 = vsubq_s16(q5s16, q0s16); + q13s16 = vsubq_s16(q6s16, q2s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); + q14s16 = vsubq_s16(q7s16, q1s16); + q13s16 = vsubq_s16(q4s16, q3s16); + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); + // part of stage 6 + LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) + q8s16 = vaddq_s16(q14s16, q1s16); + q9s16 = vaddq_s16(q13s16, q6s16); + q13s16 = vsubq_s16(q13s16, q6s16); + q1s16 = vsubq_s16(q14s16, q1s16); + STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) + LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) + q14s16 = vsubq_s16(q8s16, q5s16); + q10s16 = vaddq_s16(q8s16, q5s16); + q11s16 = vaddq_s16(q9s16, q0s16); + q0s16 = vsubq_s16(q9s16, q0s16); + STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) + // part of stage 7 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) + STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) + DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); + STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) + + // ----------------------------------------- + // BLOCK C: 8-10,11-15 + // ----------------------------------------- + // generate 8,9,14,15 + // part of stage 2 + LOAD_FROM_TRANSPOSED(3, 2, 30) + DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(30, 18, 14) + DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) + // part of stage 3 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 4 + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) + + // generate 10,11,12,13 + // part of stage 2 + LOAD_FROM_TRANSPOSED(14, 10, 22) + DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(22, 26, 6) + DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) + // part of stage 3 + q14s16 = vsubq_s16(q4s16, q5s16); + q5s16 = vaddq_s16(q4s16, q5s16); + q13s16 = vsubq_s16(q6s16, q7s16); + q6s16 = vaddq_s16(q6s16, q7s16); + // part of stage 4 + DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) + // part of stage 5 + q8s16 = vaddq_s16(q0s16, q5s16); + q9s16 = vaddq_s16(q1s16, q7s16); + q13s16 = vsubq_s16(q1s16, q7s16); + q14s16 = vsubq_s16(q3s16, q4s16); + q10s16 = vaddq_s16(q3s16, q4s16); + q15s16 = vaddq_s16(q2s16, q6s16); + STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) + STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) + // part of stage 6 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) + q13s16 = vsubq_s16(q0s16, q5s16); + q14s16 = vsubq_s16(q2s16, q6s16); + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) + + // ----------------------------------------- + // BLOCK D: 0-3,4-7 + // ----------------------------------------- + // generate 4,5,6,7 + // part of stage 3 + LOAD_FROM_TRANSPOSED(6, 4, 28) + DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) + LOAD_FROM_TRANSPOSED(28, 20, 12) + DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) + // part of stage 4 + q13s16 = vsubq_s16(q0s16, q1s16); + q0s16 = vaddq_s16(q0s16, q1s16); + q14s16 = vsubq_s16(q2s16, q3s16); + q2s16 = vaddq_s16(q2s16, q3s16); + // part of stage 5 + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) + + // generate 0,1,2,3 + // part of stage 4 + LOAD_FROM_TRANSPOSED(12, 0, 16) + DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) + LOAD_FROM_TRANSPOSED(16, 8, 24) + DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) + // part of stage 5 + q4s16 = vaddq_s16(q7s16, q6s16); + q7s16 = vsubq_s16(q7s16, q6s16); + q6s16 = vsubq_s16(q5s16, q14s16); + q5s16 = vaddq_s16(q5s16, q14s16); + // part of stage 6 + q8s16 = vaddq_s16(q4s16, q2s16); + q9s16 = vaddq_s16(q5s16, q3s16); + q10s16 = vaddq_s16(q6s16, q1s16); + q11s16 = vaddq_s16(q7s16, q0s16); + q12s16 = vsubq_s16(q7s16, q0s16); + q13s16 = vsubq_s16(q6s16, q1s16); + q14s16 = vsubq_s16(q5s16, q3s16); + q15s16 = vsubq_s16(q4s16, q2s16); + // part of stage 7 + LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) + q2s16 = vaddq_s16(q8s16, q1s16); + q3s16 = vaddq_s16(q9s16, q0s16); + q4s16 = vsubq_s16(q9s16, q0s16); + q5s16 = vsubq_s16(q8s16, q1s16); + LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) + q8s16 = vaddq_s16(q4s16, q1s16); + q9s16 = vaddq_s16(q5s16, q0s16); + q6s16 = vsubq_s16(q5s16, q0s16); + q7s16 = vsubq_s16(q4s16, q1s16); + + if (idct32_pass_loop == 0) { + idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, + q10s16, q11s16, q12s16, q13s16, q14s16, + q15s16); + } else { + idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16, + q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, + q14s16, q15s16); + dest += 8; + } + } + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm new file mode 100644 index 000000000..6bd733d5d --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm @@ -0,0 +1,71 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + + EXPORT |aom_idct4x4_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct4x4_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.s16 q0, r0 ; duplicate a1 + + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q8, q0, d2 ; dest[x] + a1 + vaddw.u8 q9, q0, d4 + + vqmovun.s16 d6, q8 ; clip_pixel + vqmovun.s16 d7, q9 + + vst1.32 {d6[0]}, [r12], r2 + vst1.32 {d6[1]}, [r12], r2 + vst1.32 {d7[0]}, [r12], r2 + vst1.32 {d7[1]}, [r12] + + bx lr + ENDP ; |aom_idct4x4_1_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c new file mode 100644 index 000000000..3df7a901b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/inv_txfm.h" +#include "aom_ports/mem.h" + +void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d6u8; + uint32x2_t d2u32 = vdup_n_u32(0); + uint16x8_t q8u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + q0s16 = vdupq_n_s16(a1); + + // dc_only_idct_add + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); + d1 += dest_stride; + d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); + d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); + d2 += dest_stride; + vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); + d2 += dest_stride; + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm new file mode 100644 index 000000000..127acf614 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm @@ -0,0 +1,193 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_idct4x4_16_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY ; name this block of code +;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct4x4_16_add_neon| PROC + + ; The 2D transform is done with two passes which are actually pretty + ; similar. We first transform the rows. This is done by transposing + ; the inputs, doing an SIMD column transform (the columns are the + ; transposed rows) and then transpose the results (so that it goes back + ; in normal/row positions). Then, we transform the columns by doing + ; another SIMD column transform. + ; So, two passes of a transpose followed by a column transform. + + ; load the inputs into q8-q9, d16-d19 + vld1.s16 {q8,q9}, [r0]! + + ; generate scalar constants + ; cospi_8_64 = 15137 = 0x3b21 + mov r0, #0x3b00 + add r0, #0x21 + ; cospi_16_64 = 11585 = 0x2d41 + mov r3, #0x2d00 + add r3, #0x41 + ; cospi_24_64 = 6270 = 0x 187e + mov r12, #0x1800 + add r12, #0x7e + + ; transpose the input data + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + + ; generate constant vectors + vdup.16 d20, r0 ; replicate cospi_8_64 + vdup.16 d21, r3 ; replicate cospi_16_64 + + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + vdup.16 d22, r12 ; replicate cospi_24_64 + + ; do the transform on transposed rows + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + vswp d18, d19 + + ; transpose the results + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + ; do the transform on columns + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + + ; The results are in two registers, one of them being swapped. This will + ; be taken care of by loading the 'dest' value in a swapped fashion and + ; also storing them in the same swapped fashion. + ; temp_out[0, 1] = d16, d17 = q8 + ; temp_out[2, 3] = d19, d18 = q9 swapped + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1] ; no post-increment + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |aom_idct4x4_16_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c new file mode 100644 index 000000000..763be1ab0 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/txfm_common.h" + +void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d26u8, d27u8; + uint32x2_t d26u32, d27u32; + uint16x8_t q8u16, q9u16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; + int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; + int16x8_t q8s16, q9s16, q13s16, q14s16; + int32x4_t q1s32, q13s32, q14s32, q15s32; + int16x4x2_t d0x2s16, d1x2s16; + int32x4x2_t q0x2s32; + uint8_t *d; + + d26u32 = d27u32 = vdup_n_u32(0); + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_low_s16(q9s16); + d19s16 = vget_high_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + d20s16 = vdup_n_s16((int16_t)cospi_8_64); + d21s16 = vdup_n_s16((int16_t)cospi_16_64); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + d22s16 = vdup_n_s16((int16_t)cospi_24_64); + + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + d16s16 = vget_low_s16(q8s16); + d17s16 = vget_high_s16(q8s16); + d18s16 = vget_high_s16(q9s16); // vswp d18 d19 + d19s16 = vget_low_s16(q9s16); + + d0x2s16 = vtrn_s16(d16s16, d17s16); + d1x2s16 = vtrn_s16(d18s16, d19s16); + q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); + q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); + d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); + d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); + + // do the transform on columns + // stage 1 + d23s16 = vadd_s16(d16s16, d18s16); + d24s16 = vsub_s16(d16s16, d18s16); + + q15s32 = vmull_s16(d17s16, d22s16); + q1s32 = vmull_s16(d17s16, d20s16); + q13s32 = vmull_s16(d23s16, d21s16); + q14s32 = vmull_s16(d24s16, d21s16); + + q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); + q1s32 = vmlal_s16(q1s32, d19s16, d22s16); + + d26s16 = vqrshrn_n_s32(q13s32, 14); + d27s16 = vqrshrn_n_s32(q14s32, 14); + d29s16 = vqrshrn_n_s32(q15s32, 14); + d28s16 = vqrshrn_n_s32(q1s32, 14); + q13s16 = vcombine_s16(d26s16, d27s16); + q14s16 = vcombine_s16(d28s16, d29s16); + + // stage 2 + q8s16 = vaddq_s16(q13s16, q14s16); + q9s16 = vsubq_s16(q13s16, q14s16); + + q8s16 = vrshrq_n_s16(q8s16, 4); + q9s16 = vrshrq_n_s16(q9s16, 4); + + d = dest; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); + d += dest_stride; + d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); + d += dest_stride; + d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); + + d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + + d = dest; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); + d += dest_stride; + vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm new file mode 100644 index 000000000..ec07e2053 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm @@ -0,0 +1,91 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + + EXPORT |aom_idct8x8_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct8x8_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 5) + add r0, r0, #16 ; + (1 <<((5) - 1)) + asr r0, r0, #5 ; >> 5 + + vdup.s16 q0, r0 ; duplicate a1 + + ; load destination data + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r2 + vld1.64 {d17}, [r1] + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |aom_idct8x8_1_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c new file mode 100644 index 000000000..c7926f9e4 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/inv_txfm.h" +#include "aom_ports/mem.h" + +void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8x8_t d2u8, d3u8, d30u8, d31u8; + uint64x1_t d2u64, d3u64, d4u64, d5u64; + uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; + int16x8_t q0s16; + uint8_t *d1, *d2; + int16_t i, a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 5); + + q0s16 = vdupq_n_s16(a1); + q0u16 = vreinterpretq_u16_s16(q0s16); + + d1 = d2 = dest; + for (i = 0; i < 2; i++) { + d2u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d4u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + d5u64 = vld1_u64((const uint64_t *)d1); + d1 += dest_stride; + + q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); + q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); + q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); + q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); + d2 += dest_stride; + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm new file mode 100644 index 000000000..f3d5f246d --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm @@ -0,0 +1,522 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_idct8x8_64_add_neon| + EXPORT |aom_idct8x8_12_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are + ; loaded in q8-q15. The output will be stored back into q8-q15 registers. + ; This macro will touch q0-q7 registers and use them as buffer during + ; calculation. + MACRO + IDCT8x8_1D + ; stage 1 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r4 ; duplicate cospi_4_64 + vdup.16 d2, r5 ; duplicate cospi_12_64 + vdup.16 d3, r6 ; duplicate cospi_20_64 + + ; input[1] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; input[5] * cospi_12_64 + vmull.s16 q5, d26, d2 + vmull.s16 q6, d27, d2 + + ; input[1]*cospi_28_64-input[7]*cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q5, d22, d3 + vmlsl.s16 q6, d23, d3 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q5, #14 ; >> 14 + vqrshrn.s32 d11, q6, #14 ; >> 14 + + ; input[1] * cospi_4_64 + vmull.s16 q2, d18, d1 + vmull.s16 q3, d19, d1 + + ; input[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q13, d27, d3 + + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vmlal.s16 q2, d30, d0 + vmlal.s16 q3, d31, d0 + + ; input[5] * cospi_20_64 + input[3] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q13, d23, d2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 + + ; stage 2 & stage 3 - even half + vdup.16 d0, r7 ; duplicate cospi_16_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q13, #14 ; >> 14 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[0] * cospi_16_64 + vmull.s16 q13, d16, d0 + vmull.s16 q15, d17, d0 + + ; (input[0] + input[2]) * cospi_16_64 + vmlal.s16 q2, d24, d0 + vmlal.s16 q3, d25, d0 + + ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q13, d24, d0 + vmlsl.s16 q15, d25, d0 + + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q2, #14 ; >> 14 + vqrshrn.s32 d19, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d22, q13, #14 ; >> 14 + vqrshrn.s32 d23, q15, #14 ; >> 14 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + ; input[1] * cospi_24_64 + vmull.s16 q2, d20, d0 + vmull.s16 q3, d21, d0 + + ; input[1] * cospi_8_64 + vmull.s16 q8, d20, d1 + vmull.s16 q12, d21, d1 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q2, d28, d1 + vmlsl.s16 q3, d29, d1 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q8, d28, d0 + vmlal.s16 q12, d29, d0 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d26, q2, #14 ; >> 14 + vqrshrn.s32 d27, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d30, q8, #14 ; >> 14 + vqrshrn.s32 d31, q12, #14 ; >> 14 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + MEND + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct8x8_64_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + IDCT8x8_1D + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + vpop {d8-d15} + pop {r4-r9} + bx lr + ENDP ; |aom_idct8x8_64_add_neon| + +;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct8x8_12_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + ; stage 1 + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling + ; multiply and shift the result by 16 bits instead of 14 bits. So we need + ; to double the constants before multiplying to compensate this. + mov r12, r3, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_28_64*2 + mov r12, r4, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; dct_const_round_shift(input[1] * cospi_28_64) + vqrdmulh.s16 q4, q9, q0 + + mov r12, r6, lsl #1 + rsb r12, #0 + vdup.16 q0, r12 ; duplicate -cospi_20_64*2 + + ; dct_const_round_shift(input[1] * cospi_4_64) + vqrdmulh.s16 q7, q9, q1 + + mov r12, r5, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_12_64*2 + + ; dct_const_round_shift(- input[3] * cospi_20_64) + vqrdmulh.s16 q5, q11, q0 + + mov r12, r7, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_16_64*2 + + ; dct_const_round_shift(input[3] * cospi_12_64) + vqrdmulh.s16 q6, q11, q1 + + ; stage 2 & stage 3 - even half + mov r12, r8, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_24_64*2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrdmulh.s16 q9, q8, q0 + + mov r12, r9, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_8_64*2 + + ; dct_const_round_shift(input[1] * cospi_24_64) + vqrdmulh.s16 q13, q10, q1 + + ; dct_const_round_shift(input[1] * cospi_8_64) + vqrdmulh.s16 q15, q10, q0 + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + vpop {d8-d15} + pop {r4-r9} + bx lr + ENDP ; |aom_idct8x8_12_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c new file mode 100644 index 000000000..8ad70862d --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c @@ -0,0 +1,509 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" + +static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; + int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 + *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 + *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 + *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 + *q12s16 = vcombine_s16(d17s16, d25s16); + *q13s16 = vcombine_s16(d19s16, d27s16); + *q14s16 = vcombine_s16(d21s16, d29s16); + *q15s16 = vcombine_s16(d23s16, d31s16); + + q0x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); + q1x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); + q2x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); + q3x2s32 = + vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); + + q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 + vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 + q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 + vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 + q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 + vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 + q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 + vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 + + *q8s16 = q0x2s16.val[0]; + *q9s16 = q0x2s16.val[1]; + *q10s16 = q1x2s16.val[0]; + *q11s16 = q1x2s16.val[1]; + *q12s16 = q2x2s16.val[0]; + *q13s16 = q2x2s16.val[1]; + *q14s16 = q3x2s16.val[0]; + *q15s16 = q3x2s16.val[1]; + return; +} + +static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, + int16x8_t *q10s16, int16x8_t *q11s16, + int16x8_t *q12s16, int16x8_t *q13s16, + int16x8_t *q14s16, int16x8_t *q15s16) { + int16x4_t d0s16, d1s16, d2s16, d3s16; + int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; + int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; + int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; + int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; + + d0s16 = vdup_n_s16((int16_t)cospi_28_64); + d1s16 = vdup_n_s16((int16_t)cospi_4_64); + d2s16 = vdup_n_s16((int16_t)cospi_12_64); + d3s16 = vdup_n_s16((int16_t)cospi_20_64); + + d16s16 = vget_low_s16(*q8s16); + d17s16 = vget_high_s16(*q8s16); + d18s16 = vget_low_s16(*q9s16); + d19s16 = vget_high_s16(*q9s16); + d20s16 = vget_low_s16(*q10s16); + d21s16 = vget_high_s16(*q10s16); + d22s16 = vget_low_s16(*q11s16); + d23s16 = vget_high_s16(*q11s16); + d24s16 = vget_low_s16(*q12s16); + d25s16 = vget_high_s16(*q12s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + d30s16 = vget_low_s16(*q15s16); + d31s16 = vget_high_s16(*q15s16); + + q2s32 = vmull_s16(d18s16, d0s16); + q3s32 = vmull_s16(d19s16, d0s16); + q5s32 = vmull_s16(d26s16, d2s16); + q6s32 = vmull_s16(d27s16, d2s16); + + q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); + q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); + q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); + + d8s16 = vqrshrn_n_s32(q2s32, 14); + d9s16 = vqrshrn_n_s32(q3s32, 14); + d10s16 = vqrshrn_n_s32(q5s32, 14); + d11s16 = vqrshrn_n_s32(q6s32, 14); + q4s16 = vcombine_s16(d8s16, d9s16); + q5s16 = vcombine_s16(d10s16, d11s16); + + q2s32 = vmull_s16(d18s16, d1s16); + q3s32 = vmull_s16(d19s16, d1s16); + q9s32 = vmull_s16(d26s16, d3s16); + q13s32 = vmull_s16(d27s16, d3s16); + + q2s32 = vmlal_s16(q2s32, d30s16, d0s16); + q3s32 = vmlal_s16(q3s32, d31s16, d0s16); + q9s32 = vmlal_s16(q9s32, d22s16, d2s16); + q13s32 = vmlal_s16(q13s32, d23s16, d2s16); + + d14s16 = vqrshrn_n_s32(q2s32, 14); + d15s16 = vqrshrn_n_s32(q3s32, 14); + d12s16 = vqrshrn_n_s32(q9s32, 14); + d13s16 = vqrshrn_n_s32(q13s32, 14); + q6s16 = vcombine_s16(d12s16, d13s16); + q7s16 = vcombine_s16(d14s16, d15s16); + + d0s16 = vdup_n_s16((int16_t)cospi_16_64); + + q2s32 = vmull_s16(d16s16, d0s16); + q3s32 = vmull_s16(d17s16, d0s16); + q13s32 = vmull_s16(d16s16, d0s16); + q15s32 = vmull_s16(d17s16, d0s16); + + q2s32 = vmlal_s16(q2s32, d24s16, d0s16); + q3s32 = vmlal_s16(q3s32, d25s16, d0s16); + q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); + q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); + + d0s16 = vdup_n_s16((int16_t)cospi_24_64); + d1s16 = vdup_n_s16((int16_t)cospi_8_64); + + d18s16 = vqrshrn_n_s32(q2s32, 14); + d19s16 = vqrshrn_n_s32(q3s32, 14); + d22s16 = vqrshrn_n_s32(q13s32, 14); + d23s16 = vqrshrn_n_s32(q15s32, 14); + *q9s16 = vcombine_s16(d18s16, d19s16); + *q11s16 = vcombine_s16(d22s16, d23s16); + + q2s32 = vmull_s16(d20s16, d0s16); + q3s32 = vmull_s16(d21s16, d0s16); + q8s32 = vmull_s16(d20s16, d1s16); + q12s32 = vmull_s16(d21s16, d1s16); + + q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); + q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); + q8s32 = vmlal_s16(q8s32, d28s16, d0s16); + q12s32 = vmlal_s16(q12s32, d29s16, d0s16); + + d26s16 = vqrshrn_n_s32(q2s32, 14); + d27s16 = vqrshrn_n_s32(q3s32, 14); + d30s16 = vqrshrn_n_s32(q8s32, 14); + d31s16 = vqrshrn_n_s32(q12s32, 14); + *q13s16 = vcombine_s16(d26s16, d27s16); + *q15s16 = vcombine_s16(d30s16, d31s16); + + q0s16 = vaddq_s16(*q9s16, *q15s16); + q1s16 = vaddq_s16(*q11s16, *q13s16); + q2s16 = vsubq_s16(*q11s16, *q13s16); + q3s16 = vsubq_s16(*q9s16, *q15s16); + + *q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + *q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(*q13s16); + d27s16 = vget_high_s16(*q13s16); + d28s16 = vget_low_s16(*q14s16); + d29s16 = vget_high_s16(*q14s16); + + d16s16 = vdup_n_s16((int16_t)cospi_16_64); + + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + *q8s16 = vaddq_s16(q0s16, q7s16); + *q9s16 = vaddq_s16(q1s16, q6s16); + *q10s16 = vaddq_s16(q2s16, q5s16); + *q11s16 = vaddq_s16(q3s16, q4s16); + *q12s16 = vsubq_s16(q3s16, q4s16); + *q13s16 = vsubq_s16(q2s16, q5s16); + *q14s16 = vsubq_s16(q1s16, q6s16); + *q15s16 = vsubq_s16(q0s16, q7s16); + return; +} + +void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} + +void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { + uint8_t *d1, *d2; + uint8x8_t d0u8, d1u8, d2u8, d3u8; + int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; + int16x4_t d26s16, d27s16, d28s16, d29s16; + uint64x1_t d0u64, d1u64, d2u64, d3u64; + int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; + int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + uint16x8_t q8u16, q9u16, q10u16, q11u16; + int32x4_t q9s32, q10s32, q11s32, q12s32; + + q8s16 = vld1q_s16(input); + q9s16 = vld1q_s16(input + 8); + q10s16 = vld1q_s16(input + 16); + q11s16 = vld1q_s16(input + 24); + q12s16 = vld1q_s16(input + 32); + q13s16 = vld1q_s16(input + 40); + q14s16 = vld1q_s16(input + 48); + q15s16 = vld1q_s16(input + 56); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + // First transform rows + // stage 1 + q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); + q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); + + q4s16 = vqrdmulhq_s16(q9s16, q0s16); + + q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2); + + q7s16 = vqrdmulhq_s16(q9s16, q1s16); + + q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2); + + q5s16 = vqrdmulhq_s16(q11s16, q0s16); + + q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); + + q6s16 = vqrdmulhq_s16(q11s16, q1s16); + + // stage 2 & stage 3 - even half + q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2); + + q9s16 = vqrdmulhq_s16(q8s16, q0s16); + + q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2); + + q13s16 = vqrdmulhq_s16(q10s16, q1s16); + + q15s16 = vqrdmulhq_s16(q10s16, q0s16); + + // stage 3 -odd half + q0s16 = vaddq_s16(q9s16, q15s16); + q1s16 = vaddq_s16(q9s16, q13s16); + q2s16 = vsubq_s16(q9s16, q13s16); + q3s16 = vsubq_s16(q9s16, q15s16); + + // stage 2 - odd half + q13s16 = vsubq_s16(q4s16, q5s16); + q4s16 = vaddq_s16(q4s16, q5s16); + q14s16 = vsubq_s16(q7s16, q6s16); + q7s16 = vaddq_s16(q7s16, q6s16); + d26s16 = vget_low_s16(q13s16); + d27s16 = vget_high_s16(q13s16); + d28s16 = vget_low_s16(q14s16); + d29s16 = vget_high_s16(q14s16); + + d16s16 = vdup_n_s16((int16_t)cospi_16_64); + q9s32 = vmull_s16(d28s16, d16s16); + q10s32 = vmull_s16(d29s16, d16s16); + q11s32 = vmull_s16(d28s16, d16s16); + q12s32 = vmull_s16(d29s16, d16s16); + + q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); + q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); + q11s32 = vmlal_s16(q11s32, d26s16, d16s16); + q12s32 = vmlal_s16(q12s32, d27s16, d16s16); + + d10s16 = vqrshrn_n_s32(q9s32, 14); + d11s16 = vqrshrn_n_s32(q10s32, 14); + d12s16 = vqrshrn_n_s32(q11s32, 14); + d13s16 = vqrshrn_n_s32(q12s32, 14); + q5s16 = vcombine_s16(d10s16, d11s16); + q6s16 = vcombine_s16(d12s16, d13s16); + + // stage 4 + q8s16 = vaddq_s16(q0s16, q7s16); + q9s16 = vaddq_s16(q1s16, q6s16); + q10s16 = vaddq_s16(q2s16, q5s16); + q11s16 = vaddq_s16(q3s16, q4s16); + q12s16 = vsubq_s16(q3s16, q4s16); + q13s16 = vsubq_s16(q2s16, q5s16); + q14s16 = vsubq_s16(q1s16, q6s16); + q15s16 = vsubq_s16(q0s16, q7s16); + + TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); + + q8s16 = vrshrq_n_s16(q8s16, 5); + q9s16 = vrshrq_n_s16(q9s16, 5); + q10s16 = vrshrq_n_s16(q10s16, 5); + q11s16 = vrshrq_n_s16(q11s16, 5); + q12s16 = vrshrq_n_s16(q12s16, 5); + q13s16 = vrshrq_n_s16(q13s16, 5); + q14s16 = vrshrq_n_s16(q14s16, 5); + q15s16 = vrshrq_n_s16(q15s16, 5); + + d1 = d2 = dest; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + + q8s16 = q12s16; + q9s16 = q13s16; + q10s16 = q14s16; + q11s16 = q15s16; + + d0u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d1u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d2u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + d3u64 = vld1_u64((uint64_t *)d1); + d1 += dest_stride; + + q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); + q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); + q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); + q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); + + d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); + d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); + + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); + d2 += dest_stride; + vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); + d2 += dest_stride; + return; +} diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c new file mode 100644 index 000000000..2dc5b2e56 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_top = vcombine_u16(p1, p1); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + sum_left = vcombine_u16(p1, p1); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 3); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 2); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 2); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 4; ++i) { + vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); + } + } +} + +void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_4x4(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_4x4(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_4x4(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_4x4(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x8_t A = vld1_u8(above); // top row + const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_top = vcombine_u16(p2, p2); + } + + if (do_left) { + const uint8x8_t L = vld1_u8(left); // left border + const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left + const uint16x4_t p1 = vpadd_u16(p0, p0); + const uint16x4_t p2 = vpadd_u16(p1, p1); + sum_left = vcombine_u16(p2, p2); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 4); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 3); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 3); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x8_t dc = vdup_lane_u8(dc0, 0); + int i; + for (i = 0; i < 8; ++i) { + vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); + } + } +} + +void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_8x8(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + dc_8x8(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + dc_8x8(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + dc_8x8(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A = vld1q_u8(above); // top row + const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_top = vcombine_u16(p3, p3); + } + + if (do_left) { + const uint8x16_t L = vld1q_u8(left); // left row + const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left + const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + const uint16x4_t p2 = vpadd_u16(p1, p1); + const uint16x4_t p3 = vpadd_u16(p2, p2); + sum_left = vcombine_u16(p3, p3); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 5); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 4); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 4); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 16; ++i) { + vst1q_u8(dst + i * stride, dc); + } + } +} + +void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_16x16(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_16x16(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_16x16(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_16x16(dst, stride, NULL, NULL, 0, 0); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +// 'do_above' and 'do_left' facilitate branch removal when inlined. +static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int do_above, int do_left) { + uint16x8_t sum_top; + uint16x8_t sum_left; + uint8x8_t dc0; + + if (do_above) { + const uint8x16_t A0 = vld1q_u8(above); // top row + const uint8x16_t A1 = vld1q_u8(above + 16); + const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top + const uint16x8_t p1 = vpaddlq_u8(A1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_top = vcombine_u16(p5, p5); + } + + if (do_left) { + const uint8x16_t L0 = vld1q_u8(left); // left row + const uint8x16_t L1 = vld1q_u8(left + 16); + const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left + const uint16x8_t p1 = vpaddlq_u8(L1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + const uint16x4_t p4 = vpadd_u16(p3, p3); + const uint16x4_t p5 = vpadd_u16(p4, p4); + sum_left = vcombine_u16(p5, p5); + } + + if (do_above && do_left) { + const uint16x8_t sum = vaddq_u16(sum_left, sum_top); + dc0 = vrshrn_n_u16(sum, 6); + } else if (do_above) { + dc0 = vrshrn_n_u16(sum_top, 5); + } else if (do_left) { + dc0 = vrshrn_n_u16(sum_left, 5); + } else { + dc0 = vdup_n_u8(0x80); + } + + { + const uint8x16_t dc = vdupq_lane_u8(dc0, 0); + int i; + for (i = 0; i < 32; ++i) { + vst1q_u8(dst + i * stride, dc); + vst1q_u8(dst + i * stride + 16, dc); + } + } +} + +void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_32x32(dst, stride, above, left, 1, 1); +} + +void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + dc_32x32(dst, stride, NULL, left, 0, 1); +} + +void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + dc_32x32(dst, stride, above, NULL, 1, 0); +} + +void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + dc_32x32(dst, stride, NULL, NULL, 0, 0); +} + +// ----------------------------------------------------------------------------- + +void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XABCD_u8 = vld1_u8(above - 1); + const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); + const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint32x2_t zero = vdup_n_u32(0); + const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); + const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); + const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); + const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); + const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); + const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); + const uint8_t D = vget_lane_u8(XABCD_u8, 4); + const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); + const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); + const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); + const uint32x2_t r3 = vreinterpret_u32_u8(avg2); + const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); + const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); + const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); + vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); + vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); + vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); +} + +#if !HAVE_NEON_ASM + +void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint32x2_t d0u32 = vdup_n_u32(0); + (void)left; + + d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) + vst1_lane_u32((uint32_t *)dst, d0u32, 0); +} + +void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x8_t d0u8 = vdup_n_u8(0); + (void)left; + + d0u8 = vld1_u8(above); + for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); +} + +void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); +} + +void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)left; + + q0u8 = vld1q_u8(above); + q1u8 = vld1q_u8(above + 16); + for (i = 0; i < 32; i++, dst += stride) { + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q1u8); + } +} + +void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d1u32 = vdup_n_u32(0); + (void)above; + + d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); +} + +void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t d0u8 = vdup_n_u8(0); + uint64x1_t d1u64 = vdup_n_u64(0); + (void)above; + + d1u64 = vld1_u64((const uint64_t *)left); + + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); + vst1_u8(dst, d0u8); + dst += stride; + d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); + vst1_u8(dst, d0u8); +} + +void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + dst += stride; + } +} + +void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint8x8_t d2u8 = vdup_n_u8(0); + uint8x16_t q0u8 = vdupq_n_u8(0); + uint8x16_t q1u8 = vdupq_n_u8(0); + (void)above; + + for (k = 0; k < 2; k++, left += 16) { + q1u8 = vld1q_u8(left); + d2u8 = vget_low_u8(q1u8); + for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { + q0u8 = vdupq_lane_u8(d2u8, 0); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 1); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 2); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 3); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 4); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 5); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 6); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + q0u8 = vdupq_lane_u8(d2u8, 7); + vst1q_u8(dst, q0u8); + vst1q_u8(dst + 16, q0u8); + dst += stride; + } + } +} + +void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int i; + uint16x8_t q1u16, q3u16; + int16x8_t q1s16; + uint8x8_t d0u8 = vdup_n_u8(0); + uint32x2_t d2u32 = vdup_n_u32(0); + + d0u8 = vld1_dup_u8(above - 1); + d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); + q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); + for (i = 0; i < 4; i++, dst += stride) { + q1u16 = vdupq_n_u16((uint16_t)left[i]); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); + d0u8 = vqmovun_s16(q1s16); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + } +} + +void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j; + uint16x8_t q0u16, q3u16, q10u16; + int16x8_t q0s16; + uint16x4_t d20u16; + uint8x8_t d0u8, d2u8, d30u8; + + d0u8 = vld1_dup_u8(above - 1); + d30u8 = vld1_u8(left); + d2u8 = vld1_u8(above); + q10u16 = vmovl_u8(d30u8); + q3u16 = vsubl_u8(d2u8, d0u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 1); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 2); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + q0u16 = vdupq_lane_u16(d20u16, 3); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); + d0u8 = vqmovun_s16(q0s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); + dst += stride; + } +} + +void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; + uint8x16_t q0u8, q1u8; + int16x8_t q0s16, q1s16, q8s16, q11s16; + uint16x4_t d20u16; + uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + for (k = 0; k < 2; k++, left += 8) { + d18u8 = vld1_u8(left); + q10u16 = vmovl_u8(d18u8); + d20u16 = vget_low_u16(q10u16); + for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { + q0u16 = vdupq_lane_u16(d20u16, 0); + q8u16 = vdupq_lane_u16(d20u16, 1); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d20u16, 2); + q8u16 = vdupq_lane_u16(d20u16, 3); + q1s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); + q0s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); + q11s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); + q8s16 = + vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); + d2u8 = vqmovun_s16(q1s16); + d3u8 = vqmovun_s16(q0s16); + d22u8 = vqmovun_s16(q11s16); + d23u8 = vqmovun_s16(q8s16); + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); + dst += stride; + vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); + vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); + dst += stride; + } + } +} + +void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int j, k; + uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; + uint8x16_t q0u8, q1u8, q2u8; + int16x8_t q12s16, q13s16, q14s16, q15s16; + uint16x4_t d6u16; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; + + q0u8 = vld1q_dup_u8(above - 1); + q1u8 = vld1q_u8(above); + q2u8 = vld1q_u8(above + 16); + q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); + q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); + q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); + q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); + for (k = 0; k < 4; k++, left += 8) { + d26u8 = vld1_u8(left); + q3u16 = vmovl_u8(d26u8); + d6u16 = vget_low_u16(q3u16); + for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { + q0u16 = vdupq_lane_u16(d6u16, 0); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 1); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 2); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + + q0u16 = vdupq_lane_u16(d6u16, 3); + q12s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); + q13s16 = + vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); + q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q10u16)); + q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), + vreinterpretq_s16_u16(q11u16)); + d0u8 = vqmovun_s16(q12s16); + d1u8 = vqmovun_s16(q13s16); + d2u8 = vqmovun_s16(q14s16); + d3u8 = vqmovun_s16(q15s16); + q0u8 = vcombine_u8(d0u8, d1u8); + q1u8 = vcombine_u8(d2u8, d3u8); + vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); + vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); + dst += stride; + } + } +} +#endif // !HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm new file mode 100644 index 000000000..7d04d3553 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm @@ -0,0 +1,633 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_v_predictor_4x4_neon| + EXPORT |aom_v_predictor_8x8_neon| + EXPORT |aom_v_predictor_16x16_neon| + EXPORT |aom_v_predictor_32x32_neon| + EXPORT |aom_h_predictor_4x4_neon| + EXPORT |aom_h_predictor_8x8_neon| + EXPORT |aom_h_predictor_16x16_neon| + EXPORT |aom_h_predictor_32x32_neon| + EXPORT |aom_tm_predictor_4x4_neon| + EXPORT |aom_tm_predictor_8x8_neon| + EXPORT |aom_tm_predictor_16x16_neon| + EXPORT |aom_tm_predictor_32x32_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_v_predictor_4x4_neon| PROC + vld1.32 {d0[0]}, [r2] + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |aom_v_predictor_4x4_neon| + +;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_v_predictor_8x8_neon| PROC + vld1.8 {d0}, [r2] + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + vst1.8 {d0}, [r0], r1 + bx lr + ENDP ; |aom_v_predictor_8x8_neon| + +;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_v_predictor_16x16_neon| PROC + vld1.8 {q0}, [r2] + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |aom_v_predictor_16x16_neon| + +;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_v_predictor_32x32_neon| PROC + vld1.8 {q0, q1}, [r2] + mov r2, #2 +loop_v + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + vst1.8 {q0, q1}, [r0], r1 + subs r2, r2, #1 + bgt loop_v + bx lr + ENDP ; |aom_v_predictor_32x32_neon| + +;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_h_predictor_4x4_neon| PROC + vld1.32 {d1[0]}, [r3] + vdup.8 d0, d1[0] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[1] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[2] + vst1.32 {d0[0]}, [r0], r1 + vdup.8 d0, d1[3] + vst1.32 {d0[0]}, [r0], r1 + bx lr + ENDP ; |aom_h_predictor_4x4_neon| + +;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_h_predictor_8x8_neon| PROC + vld1.64 {d1}, [r3] + vdup.8 d0, d1[0] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[1] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[2] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[3] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[4] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[5] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[6] + vst1.64 {d0}, [r0], r1 + vdup.8 d0, d1[7] + vst1.64 {d0}, [r0], r1 + bx lr + ENDP ; |aom_h_predictor_8x8_neon| + +;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_h_predictor_16x16_neon| PROC + vld1.8 {q1}, [r3] + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0], r1 + bx lr + ENDP ; |aom_h_predictor_16x16_neon| + +;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_h_predictor_32x32_neon| PROC + sub r1, r1, #16 + mov r2, #2 +loop_h + vld1.8 {q1}, [r3]! + vdup.8 q0, d2[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d2[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[0] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[1] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[2] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[3] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[4] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[5] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[6] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + vdup.8 q0, d3[7] + vst1.8 {q0}, [r0]! + vst1.8 {q0}, [r0], r1 + subs r2, r2, #1 + bgt loop_h + bx lr + ENDP ; |aom_h_predictor_32x32_neon| + +;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_4x4_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.u8 {d0[]}, [r12] + + ; Load above 4 pixels + vld1.32 {d2[0]}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3]! + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + + ; 3rd row and 4th row + vld1.u8 {d2[]}, [r3]! + vld1.u8 {d4[]}, [r3] + vmovl.u8 q1, d2 + vmovl.u8 q2, d4 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqmovun.s16 d0, q1 + vqmovun.s16 d1, q2 + vst1.32 {d0[0]}, [r0], r1 + vst1.32 {d1[0]}, [r0], r1 + bx lr + ENDP ; |aom_tm_predictor_4x4_neon| + +;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; preload 8 left + vld1.8 {d30}, [r3] + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + vmovl.u8 q10, d30 + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + vdup.16 q0, d20[0] + vdup.16 q1, d20[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 3rd row and 4th row + vdup.16 q8, d20[2] + vdup.16 q9, d20[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + ; 5th row and 6th row + vdup.16 q0, d21[0] + vdup.16 q1, d21[1] + vadd.s16 q0, q3, q0 + vadd.s16 q1, q3, q1 + + ; 7th row and 8th row + vdup.16 q8, d21[2] + vdup.16 q9, d21[3] + vadd.s16 q8, q3, q8 + vadd.s16 q9, q3, q9 + + vqmovun.s16 d0, q0 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q8 + vqmovun.s16 d3, q9 + + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + vst1.64 {d2}, [r0], r1 + vst1.64 {d3}, [r0], r1 + + bx lr + ENDP ; |aom_tm_predictor_8x8_neon| + +;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_16x16_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 8 pixels + vld1.8 {q1}, [r2] + + ; preload 8 left into r12 + vld1.8 {d18}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q2, d2, d0 + vsubl.u8 q3, d3, d0 + + vmovl.u8 q10, d18 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 2 times to process 16 rows. + mov r2, #2 + +loop_16x16_neon + ; Process two rows. + vdup.16 q0, d20[0] + vdup.16 q8, d20[1] + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d20[2] ; proload next 2 rows data + vdup.16 q8, d20[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + ; Process two rows. + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[0] ; proload next 2 rows data + vdup.16 q8, d21[1] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vdup.16 q0, d21[2] ; proload next 2 rows data + vdup.16 q8, d21[3] + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + + vadd.s16 q1, q0, q2 + vadd.s16 q0, q0, q3 + vadd.s16 q11, q8, q2 + vadd.s16 q8, q8, q3 + vqmovun.s16 d2, q1 + vqmovun.s16 d3, q0 + vqmovun.s16 d22, q11 + vqmovun.s16 d23, q8 + vld1.8 {d18}, [r3]! ; preload 8 left into r12 + vmovl.u8 q10, d18 + vst1.64 {d2,d3}, [r0], r1 + vst1.64 {d22,d23}, [r0], r1 + + subs r2, r2, #1 + bgt loop_16x16_neon + + bx lr + ENDP ; |aom_tm_predictor_16x16_neon| + +;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|aom_tm_predictor_32x32_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + vld1.8 {d0[]}, [r12] + + ; Load above 32 pixels + vld1.8 {q1}, [r2]! + vld1.8 {q2}, [r2] + + ; preload 8 left pixels + vld1.8 {d26}, [r3]! + + ; Compute above - ytop_left + vsubl.u8 q8, d2, d0 + vsubl.u8 q9, d3, d0 + vsubl.u8 q10, d4, d0 + vsubl.u8 q11, d5, d0 + + vmovl.u8 q3, d26 + + ; Load left row by row and compute left + (above - ytop_left) + ; Process 8 rows in each single loop and loop 4 times to process 32 rows. + mov r2, #4 + +loop_32x32_neon + ; Process two rows. + vdup.16 q0, d6[0] + vdup.16 q2, d6[1] + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q1, d6[2] + vdup.16 q2, d6[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q1, q8 + vadd.s16 q13, q1, q9 + vadd.s16 q14, q1, q10 + vadd.s16 q15, q1, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[0] + vdup.16 q2, d7[1] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vdup.16 q0, d7[2] + vdup.16 q2, d7[3] + vst1.64 {d24-d27}, [r0], r1 + + ; Process two rows. + vadd.s16 q12, q0, q8 + vadd.s16 q13, q0, q9 + vadd.s16 q14, q0, q10 + vadd.s16 q15, q0, q11 + vqmovun.s16 d0, q12 + vqmovun.s16 d1, q13 + vadd.s16 q12, q2, q8 + vadd.s16 q13, q2, q9 + vqmovun.s16 d2, q14 + vqmovun.s16 d3, q15 + vadd.s16 q14, q2, q10 + vadd.s16 q15, q2, q11 + vst1.64 {d0-d3}, [r0], r1 + vqmovun.s16 d24, q12 + vqmovun.s16 d25, q13 + vld1.8 {d0}, [r3]! ; preload 8 left pixels + vqmovun.s16 d26, q14 + vqmovun.s16 d27, q15 + vmovl.u8 q3, d0 + vst1.64 {d24-d27}, [r0], r1 + + subs r2, r2, #1 + bgt loop_32x32_neon + + bx lr + ENDP ; |aom_tm_predictor_32x32_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm new file mode 100644 index 000000000..b6e2c9edb --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm @@ -0,0 +1,202 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_4_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|aom_lpf_horizontal_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl aom_loop_filter_neon_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |aom_lpf_horizontal_4_dual_neon| + +; void aom_loop_filter_neon_16(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. This function uses +; registers d8-d15, so the calling function must save those registers. +; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|aom_loop_filter_neon_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |aom_loop_filter_neon_16| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c new file mode 100644 index 000000000..c0562a6ea --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" +#include "aom/aom_integer.h" + +static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // aom_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // aom_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vdupq_n_u16(3); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q4 = vdupq_n_u8(3); + q9 = vdupq_n_u8(4); + // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); + *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); + return; +} + +void aom_lpf_horizontal_4_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; + uint8x16_t qblimit, qlimit, qthresh; + uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; + + dblimit0 = vld1_u8(blimit0); + dlimit0 = vld1_u8(limit0); + dthresh0 = vld1_u8(thresh0); + dblimit1 = vld1_u8(blimit1); + dlimit1 = vld1_u8(limit1); + dthresh1 = vld1_u8(thresh1); + qblimit = vcombine_u8(dblimit0, dblimit1); + qlimit = vcombine_u8(dlimit0, dlimit1); + qthresh = vcombine_u8(dthresh0, dthresh1); + + s -= (p << 2); + + q3u8 = vld1q_u8(s); + s += p; + q4u8 = vld1q_u8(s); + s += p; + q5u8 = vld1q_u8(s); + s += p; + q6u8 = vld1q_u8(s); + s += p; + q7u8 = vld1q_u8(s); + s += p; + q8u8 = vld1q_u8(s); + s += p; + q9u8 = vld1q_u8(s); + s += p; + q10u8 = vld1q_u8(s); + + loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, + q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); + + s -= (p * 5); + vst1q_u8(s, q5u8); + s += p; + vst1q_u8(s, q6u8); + s += p; + vst1q_u8(s, q7u8); + s += p; + vst1q_u8(s, q8u8); + return; +} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm new file mode 100644 index 000000000..8b54984d5 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm @@ -0,0 +1,252 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_4_neon| + EXPORT |aom_lpf_vertical_4_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. +; +; void aom_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_horizontal_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r3, r2, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl aom_loop_filter_neon + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + pop {pc} + ENDP ; |aom_lpf_horizontal_4_neon| + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. +; +; void aom_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl aom_loop_filter_neon + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + pop {pc} + ENDP ; |aom_lpf_vertical_4_neon| + +; void aom_loop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|aom_loop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; a > blimit + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |aom_loop_filter_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c new file mode 100644 index 000000000..2b1f80b81 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d4ru8, // p1 + uint8x8_t *d5ru8, // p0 + uint8x8_t *d6ru8, // q0 + uint8x8_t *d7ru8) { // q1 + uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; + int16x8_t q12s16; + int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d3u8 = vabd_u8(d17u8, d16u8); + d4u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + d3u8 = vmax_u8(d3u8, d4u8); + d23u8 = vmax_u8(d19u8, d20u8); + + d17u8 = vabd_u8(d6u8, d7u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + d22u8 = vcgt_u8(d22u8, dthresh); + d23u8 = vmax_u8(d23u8, d3u8); + + d28u8 = vabd_u8(d5u8, d16u8); + d17u8 = vqadd_u8(d17u8, d17u8); + + d23u8 = vcge_u8(dlimit, d23u8); + + d18u8 = vdup_n_u8(0x80); + d5u8 = veor_u8(d5u8, d18u8); + d6u8 = veor_u8(d6u8, d18u8); + d7u8 = veor_u8(d7u8, d18u8); + d16u8 = veor_u8(d16u8, d18u8); + + d28u8 = vshr_n_u8(d28u8, 1); + d17u8 = vqadd_u8(d17u8, d28u8); + + d19u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); + + d17u8 = vcge_u8(dblimit, d17u8); + + d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); + + d22u8 = vorr_u8(d21u8, d22u8); + + q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); + d23u8 = vand_u8(d23u8, d17u8); + + q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); + + d17u8 = vdup_n_u8(4); + + d27s8 = vqmovn_s16(q12s16); + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); + d27s8 = vreinterpret_s8_u8(d27u8); + + d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); + d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); + d28s8 = vshr_n_s8(d28s8, 3); + d27s8 = vshr_n_s8(d27s8, 3); + + d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); + + d27s8 = vrshr_n_s8(d27s8, 1); + d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); + + d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); + d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); + + *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); + *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); + *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); + *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); + return; +} + +void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + s -= (pitch * 5); + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + s += pitch; + vst1_u8(s, d6u8); + s += pitch; + vst1_u8(s, d7u8); + } + return; +} + +void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i, pitch8; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + pitch8 = pitch * 8; + for (i = 0; i < 1; i++, src += pitch8) { + s = src - (i + 1) * 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); + + d4Result.val[0] = d4u8; + d4Result.val[1] = d5u8; + d4Result.val[2] = d6u8; + d4Result.val[3] = d7u8; + + src -= 2; + vst4_lane_u8(src, d4Result, 0); + src += pitch; + vst4_lane_u8(src, d4Result, 1); + src += pitch; + vst4_lane_u8(src, d4Result, 2); + src += pitch; + vst4_lane_u8(src, d4Result, 3); + src += pitch; + vst4_lane_u8(src, d4Result, 4); + src += pitch; + vst4_lane_u8(src, d4Result, 5); + src += pitch; + vst4_lane_u8(src, d4Result, 6); + src += pitch; + vst4_lane_u8(src, d4Result, 7); + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm new file mode 100644 index 000000000..9f3db66ee --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm @@ -0,0 +1,428 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_8_neon| + EXPORT |aom_lpf_vertical_8_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. +; +; void aom_lpf_horizontal_8_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_horizontal_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #12] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r2, r3, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r3@64], r1 ; p3 + vld1.u8 {d4}, [r2@64], r1 ; p2 + vld1.u8 {d5}, [r3@64], r1 ; p1 + vld1.u8 {d6}, [r2@64], r1 ; p0 + vld1.u8 {d7}, [r3@64], r1 ; q0 + vld1.u8 {d16}, [r2@64], r1 ; q1 + vld1.u8 {d17}, [r3@64] ; q2 + vld1.u8 {d18}, [r2@64], r1 ; q3 + + sub r3, r3, r1, lsl #1 + sub r2, r2, r1, lsl #2 + + bl aom_mbloop_filter_neon + + vst1.u8 {d0}, [r2@64], r1 ; store op2 + vst1.u8 {d1}, [r3@64], r1 ; store op1 + vst1.u8 {d2}, [r2@64], r1 ; store op0 + vst1.u8 {d3}, [r3@64], r1 ; store oq0 + vst1.u8 {d4}, [r2@64], r1 ; store oq1 + vst1.u8 {d5}, [r3@64], r1 ; store oq2 + + pop {r4-r5, pc} + + ENDP ; |aom_lpf_horizontal_8_neon| + +; void aom_lpf_vertical_8_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #12] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + sub r2, r0, #3 + add r3, r0, #1 + + bl aom_mbloop_filter_neon + + ;store op2, op1, op0, oq0 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] + + ;store oq1, oq2 + vst2.8 {d4[0], d5[0]}, [r3], r1 + vst2.8 {d4[1], d5[1]}, [r3], r1 + vst2.8 {d4[2], d5[2]}, [r3], r1 + vst2.8 {d4[3], d5[3]}, [r3], r1 + vst2.8 {d4[4], d5[4]}, [r3], r1 + vst2.8 {d4[5], d5[5]}, [r3], r1 + vst2.8 {d4[6], d5[6]}, [r3], r1 + vst2.8 {d4[7], d5[7]}, [r3] + + pop {r4-r5, pc} + ENDP ; |aom_lpf_vertical_8_neon| + +; void aom_mbloop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d0 op2 +; d1 op1 +; d2 op0 +; d3 oq0 +; d4 oq1 +; d5 oq2 +|aom_mbloop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) + + vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) + + vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) + + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) + vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) + vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d1, d19 + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) + vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) + + vshr.u8 d23, d23, #1 ; a = a / 2 + + vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) + + vqadd.u8 d24, d24, d23 ; a = b + a + + vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) + + vmov.u8 d23, #1 + vcge.u8 d24, d0, d24 ; a > blimit + + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + + vcge.u8 d20, d23, d20 ; flat + + vand d19, d19, d24 ; mask + + vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + + vand d20, d20, d19 ; flat & mask + + vmov.u8 d22, #0x80 + + vorr d23, d21, d23 ; hev + + ; This instruction will truncate the "flat & mask" masks down to 4 bits + ; each to fit into one 32 bit arm register. The values are stored in + ; q10.64[0]. + vshrn.u16 d30, q10, #4 + vmov.u32 r4, d30[0] ; flat & mask 4bits + + adds r5, r4, #1 ; Check for all 1's + + ; If mask and flat are 1's for all vectors, then we only need to execute + ; the power branch for all vectors. + beq power_branch_only + + cmp r4, #0 ; Check for 0, set flag for later + + ; mbfilter() function + ; filter() function + ; convert to signed + veor d21, d7, d22 ; qs0 + veor d24, d6, d22 ; ps0 + veor d25, d5, d22 ; ps1 + veor d26, d16, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d21, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d23 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d23 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + ; If mask and flat are 0's for all vectors, then we only need to execute + ; the filter branch for all vectors. + beq filter_branch_only + + ; If mask and flat are mixed then we must perform both branches and + ; combine the data. + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d21, d21, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + ; At this point we have already executed the filter branch. The filter + ; branch does not set op2 or oq2, so use p2 and q2. Execute the power + ; branch and combine the data. + vmov.u8 d23, #2 + vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 + vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 + + vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) + + vaddw.u8 q14, d5 ; r_op2 += p1 + + vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + + vqrshrn.u16 d30, q14, #3 ; r_op2 + + vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 + vsubw.u8 q14, d4 ; r_op1 -= p2 + vaddw.u8 q14, d5 ; r_op1 += p1 + vaddw.u8 q14, d16 ; r_op1 += q1 + + vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + + vqrshrn.u16 d31, q14, #3 ; r_op1 + + vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 + vsubw.u8 q14, d5 ; r_op0 -= p1 + vaddw.u8 q14, d6 ; r_op0 += p0 + vaddw.u8 q14, d17 ; r_op0 += q2 + + vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) + + vqrshrn.u16 d23, q14, #3 ; r_op0 + + vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 + vsubw.u8 q14, d6 ; r_oq0 -= p0 + vaddw.u8 q14, d7 ; r_oq0 += q0 + + vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) + + vaddw.u8 q14, d18 ; oq0 += q3 + + vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) + + vqrshrn.u16 d22, q14, #3 ; r_oq0 + + vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 + vsubw.u8 q14, d7 ; r_oq1 -= q0 + vaddw.u8 q14, d16 ; r_oq1 += q1 + + vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) + + vaddw.u8 q14, d18 ; r_oq1 += q3 + + vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) + + vqrshrn.u16 d6, q14, #3 ; r_oq1 + + vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 + vsubw.u8 q14, d16 ; r_oq2 -= q1 + vaddw.u8 q14, d17 ; r_oq2 += q2 + vaddw.u8 q14, d18 ; r_oq2 += q3 + + vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) + + vqrshrn.u16 d7, q14, #3 ; r_oq2 + + vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) + vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) + vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) + + bx lr + +power_branch_only + vmov.u8 d27, #3 + vmov.u8 d21, #2 + vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 + vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 + vaddw.u8 q14, d5 ; op2 += p1 + vqrshrn.u16 d0, q14, #3 ; op2 + + vsubw.u8 q14, d3 ; op1 = op2 - p3 + vsubw.u8 q14, d4 ; op1 -= p2 + vaddw.u8 q14, d5 ; op1 += p1 + vaddw.u8 q14, d16 ; op1 += q1 + vqrshrn.u16 d1, q14, #3 ; op1 + + vsubw.u8 q14, d3 ; op0 = op1 - p3 + vsubw.u8 q14, d5 ; op0 -= p1 + vaddw.u8 q14, d6 ; op0 += p0 + vaddw.u8 q14, d17 ; op0 += q2 + vqrshrn.u16 d2, q14, #3 ; op0 + + vsubw.u8 q14, d3 ; oq0 = op0 - p3 + vsubw.u8 q14, d6 ; oq0 -= p0 + vaddw.u8 q14, d7 ; oq0 += q0 + vaddw.u8 q14, d18 ; oq0 += q3 + vqrshrn.u16 d3, q14, #3 ; oq0 + + vsubw.u8 q14, d4 ; oq1 = oq0 - p2 + vsubw.u8 q14, d7 ; oq1 -= q0 + vaddw.u8 q14, d16 ; oq1 += q1 + vaddw.u8 q14, d18 ; oq1 += q3 + vqrshrn.u16 d4, q14, #3 ; oq1 + + vsubw.u8 q14, d5 ; oq2 = oq1 - p1 + vsubw.u8 q14, d16 ; oq2 -= q1 + vaddw.u8 q14, d17 ; oq2 += q2 + vaddw.u8 q14, d18 ; oq2 += q3 + vqrshrn.u16 d5, q14, #3 ; oq2 + + bx lr + +filter_branch_only + ; TODO(fgalligan): See if we can rearange registers so we do not need to + ; do the 2 vswp. + vswp d0, d4 ; op2 + vswp d5, d17 ; oq2 + veor d2, d24, d22 ; *op0 = u^0x80 + veor d3, d21, d22 ; *oq0 = u^0x80 + veor d1, d25, d22 ; *op1 = u^0x80 + veor d4, d26, d22 ; *oq1 = u^0x80 + + bx lr + + ENDP ; |aom_mbloop_filter_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c new file mode 100644 index 000000000..c4502fdb5 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p2 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d0ru8, // p1 + uint8x8_t *d1ru8, // p1 + uint8x8_t *d2ru8, // p0 + uint8x8_t *d3ru8, // q0 + uint8x8_t *d4ru8, // q1 + uint8x8_t *d5ru8) { // q1 + uint32_t flat; + uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; + uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int16x8_t q15s16; + uint16x8_t q10u16, q14u16; + int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d23u8 = vabd_u8(d17u8, d16u8); + d24u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + + d25u8 = vabd_u8(d6u8, d4u8); + + d23u8 = vmax_u8(d23u8, d24u8); + + d26u8 = vabd_u8(d7u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + + d24u8 = vabd_u8(d6u8, d7u8); + d27u8 = vabd_u8(d3u8, d6u8); + d28u8 = vabd_u8(d18u8, d7u8); + + d19u8 = vmax_u8(d19u8, d23u8); + + d23u8 = vabd_u8(d5u8, d16u8); + d24u8 = vqadd_u8(d24u8, d24u8); + + d19u8 = vcge_u8(dlimit, d19u8); + + d25u8 = vmax_u8(d25u8, d26u8); + d26u8 = vmax_u8(d27u8, d28u8); + + d23u8 = vshr_n_u8(d23u8, 1); + + d25u8 = vmax_u8(d25u8, d26u8); + + d24u8 = vqadd_u8(d24u8, d23u8); + + d20u8 = vmax_u8(d20u8, d25u8); + + d23u8 = vdup_n_u8(1); + d24u8 = vcge_u8(dblimit, d24u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + + d20u8 = vcge_u8(d23u8, d20u8); + + d19u8 = vand_u8(d19u8, d24u8); + + d23u8 = vcgt_u8(d22u8, dthresh); + + d20u8 = vand_u8(d20u8, d19u8); + + d22u8 = vdup_n_u8(0x80); + + d23u8 = vorr_u8(d21u8, d23u8); + + q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); + + d30u8 = vshrn_n_u16(q10u16, 4); + flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); + + if (flat == 0xffffffff) { // Check for all 1's, power_branch_only + d27u8 = vdup_n_u8(3); + d21u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d21u8); + q14u16 = vaddw_u8(q14u16, d5u8); + *d0ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + *d1ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + *d2ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d3ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d4ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d5ru8 = vqrshrn_n_u16(q14u16, 3); + } else { + d21u8 = veor_u8(d7u8, d22u8); + d24u8 = veor_u8(d6u8, d22u8); + d25u8 = veor_u8(d5u8, d22u8); + d26u8 = veor_u8(d16u8, d22u8); + + d27u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); + d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); + + q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); + + d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + q15s16 = vaddw_s8(q15s16, d29s8); + + d29u8 = vdup_n_u8(4); + + d28s8 = vqmovn_s16(q15s16); + + d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); + d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); + d30s8 = vshr_n_s8(d30s8, 3); + d29s8 = vshr_n_s8(d29s8, 3); + + d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); + d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); + + d29s8 = vrshr_n_s8(d29s8, 1); + d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); + + if (flat == 0) { // filter_branch_only + *d0ru8 = d4u8; + *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + *d5ru8 = d17u8; + return; + } + + d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + + d23u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d23u8); + + d0u8 = vbsl_u8(d20u8, dblimit, d4u8); + + q14u16 = vaddw_u8(q14u16, d5u8); + + d1u8 = vbsl_u8(d20u8, dlimit, d25u8); + + d30u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d2u8 = vbsl_u8(d20u8, dthresh, d24u8); + + d31u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + + *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); + + d23u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + + *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); + + d22u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d3u8 = vbsl_u8(d20u8, d3u8, d21u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + d4u8 = vbsl_u8(d20u8, d4u8, d26u8); + + d6u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + + d5u8 = vbsl_u8(d20u8, d5u8, d17u8); + + d7u8 = vqrshrn_n_u16(q14u16, 3); + + *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); + *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); + *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); + } + return; +} + +void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < 1; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + s -= (pitch * 6); + vst1_u8(s, d0u8); + s += pitch; + vst1_u8(s, d1u8); + s += pitch; + vst1_u8(s, d2u8); + s += pitch; + vst1_u8(s, d3u8); + s += pitch; + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + } + return; +} + +void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + uint8x8x2_t d2Result; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + for (i = 0; i < 1; i++) { + s = src + (i * (pitch << 3)) - 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, + d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, + &d5u8); + + d4Result.val[0] = d0u8; + d4Result.val[1] = d1u8; + d4Result.val[2] = d2u8; + d4Result.val[3] = d3u8; + + d2Result.val[0] = d4u8; + d2Result.val[1] = d5u8; + + s = src - 3; + vst4_lane_u8(s, d4Result, 0); + s += pitch; + vst4_lane_u8(s, d4Result, 1); + s += pitch; + vst4_lane_u8(s, d4Result, 2); + s += pitch; + vst4_lane_u8(s, d4Result, 3); + s += pitch; + vst4_lane_u8(s, d4Result, 4); + s += pitch; + vst4_lane_u8(s, d4Result, 5); + s += pitch; + vst4_lane_u8(s, d4Result, 6); + s += pitch; + vst4_lane_u8(s, d4Result, 7); + + s = src + 1; + vst2_lane_u8(s, d2Result, 0); + s += pitch; + vst2_lane_u8(s, d2Result, 1); + s += pitch; + vst2_lane_u8(s, d2Result, 2); + s += pitch; + vst2_lane_u8(s, d2Result, 3); + s += pitch; + vst2_lane_u8(s, d2Result, 4); + s += pitch; + vst2_lane_u8(s, d2Result, 5); + s += pitch; + vst2_lane_u8(s, d2Result, 6); + s += pitch; + vst2_lane_u8(s, d2Result, 7); + } + return; +} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm new file mode 100644 index 000000000..675928860 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm @@ -0,0 +1,638 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_edge_8_neon| + EXPORT |aom_lpf_horizontal_edge_16_neon| + EXPORT |aom_lpf_vertical_16_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; void mb_lpf_horizontal_edge(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; r12 int count +|mb_lpf_horizontal_edge| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 + + bl aom_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 + + b h_next + +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 + + b h_next + +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 + +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |mb_lpf_horizontal_edge| + +; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|aom_lpf_horizontal_edge_8_neon| PROC + mov r12, #1 + b mb_lpf_horizontal_edge + ENDP ; |aom_lpf_horizontal_edge_8_neon| + +; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh +|aom_lpf_horizontal_edge_16_neon| PROC + mov r12, #2 + b mb_lpf_horizontal_edge + ENDP ; |aom_lpf_horizontal_edge_16_neon| + +; void aom_lpf_vertical_16_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_16_neon| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, [r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl aom_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_end + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |aom_lpf_vertical_16_neon| + +; void aom_wide_mbfilter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. +; +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|aom_wide_mbfilter_neon| PROC + mov r7, #0 + + ; filter_mask + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 ; abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d17, d19 + + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 ; flat + + vand d19, d19, d24 ; mask + + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev + + vand d16, d20, d19 ; flat && mask + vmov r5, r6, d16 + + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) + + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 ; flat2 + + vmov.u8 d22, #0x80 + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 + + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + vand d29, d29, d21 ; filter &= hev + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter flat && mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's + ; and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddl.u8 q10, d4, d5 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vaddl.u8 q14, d6, d9 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsub.i16 q15, q10 + vaddl.u8 q10, d4, d6 + vadd.i16 q15, q14 + vaddl.u8 q14, d7, d10 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsub.i16 q15, q10 + vadd.i16 q15, q14 + vaddl.u8 q14, d8, d11 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vadd.i16 q15, q14 + vaddl.u8 q14, d9, d11 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vadd.i16 q15, q14 + vaddl.u8 q14, d10, d11 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq0 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vadd.i16 q15, q14 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vaddl.u8 q12, d2, d3 + vaddl.u8 q13, d4, d5 + vaddl.u8 q14, d1, d6 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 + vadd.i16 q12, q13 + vadd.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vadd.i16 q15, q12 + vaddl.u8 q12, d0, d1 + vaddw.u8 q15, d1 + vaddl.u8 q13, d0, d2 + vadd.i16 q14, q15, q14 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsub.i16 q15, q14, q12 + vaddl.u8 q14, d3, d10 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsub.i16 q15, q13 + vaddl.u8 q13, d0, d3 + vadd.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vadd.i16 q15, q14 + vaddl.u8 q14, d0, d4 + vsub.i16 q15, q13 + vsub.i16 q14, q15, q14 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vaddw.u8 q15, q14, d5 ; op2 += p2 + vaddl.u8 q14, d0, d5 + vaddw.u8 q15, d12 ; op2 += q4 + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d6 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d0, d7 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; op0 += q6 + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d1, d8 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vsub.i16 q15, q14 + vaddl.u8 q14, d2, d9 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddl.u8 q4, d10, d15 + vaddw.u8 q15, d15 ; oq1 += q7 + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vsub.i16 q15, q14 + vaddl.u8 q14, d3, d10 + vadd.i16 q15, q4 + vaddl.u8 q4, d11, d15 + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vsub.i16 q15, q14 + vaddl.u8 q14, d4, d11 + vadd.i16 q15, q4 + vaddl.u8 q4, d12, d15 + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vsub.i16 q15, q14 + vaddl.u8 q14, d5, d12 + vadd.i16 q15, q4 + vaddl.u8 q4, d13, d15 + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vsub.i16 q15, q14 + vaddl.u8 q14, d6, d13 + vadd.i16 q15, q4 + vaddl.u8 q4, d14, d15 + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vsub.i16 q15, q14 + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + vadd.i16 q15, q4 + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vqrshrn.u16 d3, q15, #4 ; w_oq6 + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) + + bx lr + ENDP ; |aom_wide_mbfilter_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c new file mode 100644 index 000000000..c90d6bfde --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" +#include "aom/aom_integer.h" + +void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); +} + +#if HAVE_NEON_ASM +void aom_lpf_horizontal_8_dual_neon( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh); + aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); +} +#endif // HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c new file mode 100644 index 000000000..a1eeaf4b7 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, +// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo +// and vec_sum_ref_hi. +static void sad_neon_64(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, + const uint8x16_t vec_src_32, + const uint8x16_t vec_src_48, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); +} + +// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, +// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. +static void sad_neon_32(const uint8x16_t vec_src_00, + const uint8x16_t vec_src_16, const uint8_t *ref, + uint16x8_t *vec_sum_ref_lo, + uint16x8_t *vec_sum_ref_hi) { + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); +} + +void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, + &vec_sum_ref0_lo, &vec_sum_ref0_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, + &vec_sum_ref1_lo, &vec_sum_ref1_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, + &vec_sum_ref2_lo, &vec_sum_ref2_hi); + sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, + &vec_sum_ref3_lo, &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + + sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, + &vec_sum_ref0_hi); + sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, + &vec_sum_ref1_hi); + sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, + &vec_sum_ref2_hi); + sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, + &vec_sum_ref3_hi); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} + +void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); + uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); + const uint8_t *ref0, *ref1, *ref2, *ref3; + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref0 = vld1q_u8(ref0); + const uint8x16_t vec_ref1 = vld1q_u8(ref1); + const uint8x16_t vec_ref2 = vld1q_u8(ref2); + const uint8x16_t vec_ref3 = vld1q_u8(ref3); + + vec_sum_ref0_lo = + vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); + vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref0)); + vec_sum_ref1_lo = + vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); + vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref1)); + vec_sum_ref2_lo = + vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); + vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref2)); + vec_sum_ref3_lo = + vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); + vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), + vget_high_u8(vec_ref3)); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + + res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); + res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); + res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); + res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +} diff --git a/third_party/aom/aom_dsp/arm/sad_media.asm b/third_party/aom/aom_dsp/arm/sad_media.asm new file mode 100644 index 000000000..49ddb6764 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad_media.asm @@ -0,0 +1,98 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_sad16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 const unsigned char *src_ptr +; r1 int src_stride +; r2 const unsigned char *ref_ptr +; r3 int ref_stride +|aom_sad16x16_media| PROC + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + mov r4, #0 ; sad = 0; + mov r5, #8 ; loop count + +loop + ; 1st row + ldr r6, [r0, #0x0] ; load 4 src pixels (1A) + ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) + ldr r7, [r0, #0x4] ; load 4 src pixels (1A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) + ldr r10, [r0, #0x8] ; load 4 src pixels (1B) + ldr r11, [r0, #0xC] ; load 4 src pixels (1B) + + usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + ldr r6, [r0, #0x0] ; load 4 src pixels (2A) + ldr r7, [r0, #0x4] ; load 4 src pixels (2A) + add r4, r4, r8 ; add partial sad values + + ; 2nd row + ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) + ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) + ldr r10, [r0, #0x8] ; load 4 src pixels (2B) + ldr r11, [r0, #0xC] ; load 4 src pixels (2B) + + usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels + usad8 r8, r7, r9 ; calculate sad for 4 pixels + + ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) + ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) + + add r0, r0, r1 ; set src pointer to next row + add r2, r2, r3 ; set dst pointer to next row + + usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels + usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels + + pld [r0, r1, lsl #1] + pld [r2, r3, lsl #1] + + subs r5, r5, #1 ; decrement loop counter + add r4, r4, r8 ; add partial sad values + + bne loop + + mov r0, r4 ; return sad + ldmfd sp!, {r4-r12, pc} + + ENDP + + END + diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c new file mode 100644 index 000000000..2f452f55b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/sad_neon.c @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" + +#include "aom/aom_integer.h" + +unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 15; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x8_t d0, d8; + uint16x8_t q12; + uint32x2_t d1; + uint64x1_t d3; + int i; + + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(d0, d8); + + for (i = 0; i < 3; i++) { + d0 = vld1_u8(src_ptr); + src_ptr += src_stride; + d8 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, d0, d8); + } + + d1 = vpaddl_u16(vget_low_u16(q12)); + d3 = vpaddl_u32(d1); + + return vget_lane_u32(vreinterpret_u32_u64(d3), 0); +} + +unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride, + unsigned char *ref_ptr, int ref_stride) { + uint8x16_t q0, q4; + uint16x8_t q12, q13; + uint32x4_t q1; + uint64x2_t q3; + uint32x2_t d5; + int i; + + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); + + for (i = 0; i < 7; i++) { + q0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + q4 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); + q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); + } + + q12 = vaddq_u16(q12, q13); + q1 = vpaddlq_u16(q12); + q3 = vpaddlq_u32(q1); + d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), + vreinterpret_u32_u64(vget_high_u64(q3))); + + return vget_lane_u32(d5, 0); +} + +static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} +static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { + const uint32x4_t a = vpaddlq_u16(vec_16x8); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +} + +unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + for (i = 0; i < 64; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_src_32 = vld1q_u8(src + 32); + const uint8x16_t vec_src_48 = vld1q_u8(src + 48); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); + const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), + vget_low_u8(vec_ref_32)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), + vget_high_u8(vec_ref_32)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), + vget_low_u8(vec_ref_48)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), + vget_high_u8(vec_ref_48)); + } + return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); +} + +unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 32; ++i) { + const uint8x16_t vec_src_00 = vld1q_u8(src); + const uint8x16_t vec_src_16 = vld1q_u8(src + 16); + const uint8x16_t vec_ref_00 = vld1q_u8(ref); + const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); + src += src_stride; + ref += ref_stride; + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), + vget_low_u8(vec_ref_00)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), + vget_high_u8(vec_ref_00)); + vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), + vget_low_u8(vec_ref_16)); + vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), + vget_high_u8(vec_ref_16)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum_lo = vdupq_n_u16(0); + uint16x8_t vec_accum_hi = vdupq_n_u16(0); + + for (i = 0; i < 16; ++i) { + const uint8x16_t vec_src = vld1q_u8(src); + const uint8x16_t vec_ref = vld1q_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum_lo = + vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); + vec_accum_hi = + vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); + } + return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); +} + +unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + int i; + uint16x8_t vec_accum = vdupq_n_u16(0); + + for (i = 0; i < 8; ++i) { + const uint8x8_t vec_src = vld1_u8(src); + const uint8x8_t vec_ref = vld1_u8(ref); + src += src_stride; + ref += ref_stride; + vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + } + return horizontal_add_16x8(vec_accum); +} diff --git a/third_party/aom/aom_dsp/arm/save_reg_neon.asm b/third_party/aom/aom_dsp/arm/save_reg_neon.asm new file mode 100644 index 000000000..e04969823 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/save_reg_neon.asm @@ -0,0 +1,39 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_push_neon| + EXPORT |aom_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|aom_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|aom_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_media.c b/third_party/aom/aom_dsp/arm/subpel_variance_media.c new file mode 100644 index 000000000..46ec028d3 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/subpel_variance_media.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +#if HAVE_MEDIA +static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 }, + { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, + { 32, 96 }, { 16, 112 } }; + +extern void aom_filter_block2d_bil_first_pass_media( + const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch, + uint32_t height, uint32_t width, const int16_t *filter); + +extern void aom_filter_block2d_bil_second_pass_media( + const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch, + uint32_t height, uint32_t width, const int16_t *filter); + +unsigned int aom_sub_pixel_variance8x8_media( + const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, + const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { + uint16_t first_pass[10 * 8]; + uint8_t second_pass[8 * 8]; + const int16_t *HFilter, *VFilter; + + HFilter = bilinear_filters_media[xoffset]; + VFilter = bilinear_filters_media[yoffset]; + + aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass, + src_pixels_per_line, 9, 8, HFilter); + aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, + VFilter); + + return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, + sse); +} + +unsigned int aom_sub_pixel_variance16x16_media( + const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, + const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { + uint16_t first_pass[36 * 16]; + uint8_t second_pass[20 * 16]; + const int16_t *HFilter, *VFilter; + unsigned int var; + + if (xoffset == 4 && yoffset == 0) { + var = aom_variance_halfpixvar16x16_h_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + } else if (xoffset == 0 && yoffset == 4) { + var = aom_variance_halfpixvar16x16_v_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + } else if (xoffset == 4 && yoffset == 4) { + var = aom_variance_halfpixvar16x16_hv_media( + src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); + } else { + HFilter = bilinear_filters_media[xoffset]; + VFilter = bilinear_filters_media[yoffset]; + + aom_filter_block2d_bil_first_pass_media( + src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); + aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, + 16, VFilter); + + var = aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, + sse); + } + return var; +} +#endif // HAVE_MEDIA diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c new file mode 100644 index 000000000..064b72d6f --- /dev/null +++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/variance.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + const uint8x8_t f0 = vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); + const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(&output_ptr[0], out); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + const uint8x8_t f0 = vmov_n_u8(filter[0]); + const uint8x8_t f1 = vmov_n_u8(filter[1]); + unsigned int i, j; + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 16) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + } + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]); + DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); + + var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, + bilinear_filters[xoffset]); + var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, + bilinear_filters[yoffset]); + return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]); + DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, + bilinear_filters[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, + bilinear_filters[yoffset]); + return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, + bilinear_filters[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, + bilinear_filters[yoffset]); + return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse); +} + +unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src, + int src_stride, int xoffset, + int yoffset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]); + DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, + bilinear_filters[xoffset]); + var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, + bilinear_filters[yoffset]); + return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse); +} diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c new file mode 100644 index 000000000..cb8a2daf8 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/subtract_neon.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +void aom_subtract_block_neon(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = + vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = + vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = + vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = + vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = + vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = + vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm new file mode 100644 index 000000000..1e5c9178e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm @@ -0,0 +1,185 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_variance_halfpixvar16x16_h_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|aom_variance_halfpixvar16x16_h_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset + ldr r5, [r2, #12] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END + diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm new file mode 100644 index 000000000..9e0af830e --- /dev/null +++ b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm @@ -0,0 +1,225 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_variance_halfpixvar16x16_hv_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|aom_variance_halfpixvar16x16_hv_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; pointer to pixels on the next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load source pixels a, row N + ldr r6, [r0, #1] ; load source pixels b, row N + ldr r5, [r9, #0] ; load source pixels c, row N+1 + ldr r7, [r9, #1] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #0] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load source pixels a, row N + ldr r6, [r0, #5] ; load source pixels b, row N + ldr r5, [r9, #4] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #5] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #4] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load source pixels a, row N + ldr r6, [r0, #9] ; load source pixels b, row N + ldr r5, [r9, #8] ; load source pixels c, row N+1 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + ldr r7, [r9, #9] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #8] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load source pixels a, row N + ldr r6, [r0, #13] ; load source pixels b, row N + ldr r5, [r9, #12] ; load source pixels c, row N+1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + ldr r7, [r9, #13] ; load source pixels d, row N+1 + + ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 + mvn r7, r7 + uhsub8 r5, r5, r7 + eor r5, r5, r10 + ; z = (x + y + 1) >> 1, interpolate half pixel values vertically + mvn r5, r5 + uhsub8 r4, r4, r5 + ldr r5, [r2, #12] ; load 4 ref pixels + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm new file mode 100644 index 000000000..545b68179 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm @@ -0,0 +1,187 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_variance_halfpixvar16x16_v_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|aom_variance_halfpixvar16x16_v_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + ldr r10, c80808080 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + mov lr, #0 ; constant zero +loop + add r9, r0, r1 ; set src pointer to next row + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r6, [r9, #0] ; load 4 src pixels from next row + ldr r5, [r2, #0] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r6, [r9, #4] ; load 4 src pixels from next row + ldr r5, [r2, #4] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r6, [r9, #8] ; load 4 src pixels from next row + ldr r5, [r2, #8] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r6, [r9, #12] ; load 4 src pixels from next row + ldr r5, [r2, #12] ; load 4 ref pixels + + ; bilinear interpolation + mvn r6, r6 + uhsub8 r4, r4, r6 + eor r4, r4, r10 + + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r6, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r6, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r7, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +c80808080 + DCD 0x80808080 + + END + diff --git a/third_party/aom/aom_dsp/arm/variance_media.asm b/third_party/aom/aom_dsp/arm/variance_media.asm new file mode 100644 index 000000000..fdc311a81 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/variance_media.asm @@ -0,0 +1,361 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + + EXPORT |aom_variance16x16_media| + EXPORT |aom_variance8x8_media| + EXPORT |aom_mse16x16_media| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|aom_variance16x16_media| PROC + + stmfd sp!, {r4-r12, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r8, #0 ; initialize sum = 0 + mov r11, #0 ; initialize sse = 0 + mov r12, #16 ; set loop counter to 16 (=block height) + +loop16x16 + ; 1st 4 pixels + ldr r4, [r0, #0] ; load 4 src pixels + ldr r5, [r2, #0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r6, r4, r5 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + ; calculate total sum + adds r8, r8, r4 ; add positive differences to sum + subs r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r4, [r0, #4] ; load 4 src pixels + ldr r5, [r2, #4] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r4, [r0, #8] ; load 4 src pixels + ldr r5, [r2, #8] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r4, [r0, #12] ; load 4 src pixels + ldr r5, [r2, #12] ; load 4 ref pixels + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + usub8 r6, r4, r5 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r6, lr ; select bytes with positive difference + usub8 r9, r5, r4 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r6, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r4, r7, lr ; calculate sum of positive differences + usad8 r5, r6, lr ; calculate sum of negative differences + orr r6, r6, r7 ; differences of all 4 pixels + + ; calculate total sum + add r8, r8, r4 ; add positive differences to sum + sub r8, r8, r5 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r5, r6 ; byte (two pixels) to halfwords + uxtb16 r10, r6, ror #8 ; another two pixels to halfwords + smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) + smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) + + + subs r12, r12, #1 + + bne loop16x16 + + ; return stuff + ldr r6, [sp, #40] ; get address of sse + mul r0, r8, r8 ; sum * sum + str r11, [r6] ; store sse + sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) + + ldmfd sp!, {r4-r12, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +|aom_variance8x8_media| PROC + + push {r4-r10, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #8 ; set loop counter to 8 (=block height) + mov r4, #0 ; initialize sum = 0 + mov r5, #0 ; initialize sse = 0 + +loop8x8 + ; 1st 4 pixels + ldr r6, [r0, #0x0] ; load 4 src pixels + ldr r7, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r6, r7 ; calculate difference + pld [r0, r1, lsl #1] + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r0, #0x4] ; load 4 src pixels + ldr r7, [r2, #0x4] ; load 4 ref pixels + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r6, r7 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r10, r8, lr ; select bytes with positive difference + usub8 r9, r7, r6 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r6, r10, lr ; calculate sum of positive differences + usad8 r7, r8, lr ; calculate sum of negative differences + orr r8, r8, r10 ; differences of all 4 pixels + + ; calculate total sum + add r4, r4, r6 ; add positive differences to sum + sub r4, r4, r7 ; subtract negative differences from sum + + ; calculate sse + uxtb16 r7, r8 ; byte (two pixels) to halfwords + uxtb16 r10, r8, ror #8 ; another two pixels to halfwords + smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) + subs r12, r12, #1 ; next row + smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) + + bne loop8x8 + + ; return stuff + ldr r8, [sp, #32] ; get address of sse + mul r1, r4, r4 ; sum * sum + str r5, [r8] ; store sse + sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) + + pop {r4-r10, pc} + + ENDP + +; r0 unsigned char *src_ptr +; r1 int source_stride +; r2 unsigned char *ref_ptr +; r3 int recon_stride +; stack unsigned int *sse +; +;note: Based on aom_variance16x16_media. In this function, sum is never used. +; So, we can remove this part of calculation. + +|aom_mse16x16_media| PROC + + push {r4-r9, lr} + + pld [r0, r1, lsl #0] + pld [r2, r3, lsl #0] + + mov r12, #16 ; set loop counter to 16 (=block height) + mov r4, #0 ; initialize sse = 0 + +loopmse + ; 1st 4 pixels + ldr r5, [r0, #0x0] ; load 4 src pixels + ldr r6, [r2, #0x0] ; load 4 ref pixels + + mov lr, #0 ; constant zero + + usub8 r8, r5, r6 ; calculate difference + pld [r0, r1, lsl #1] + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + pld [r2, r3, lsl #1] + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0x4] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 2nd 4 pixels + ldr r6, [r2, #0x4] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + ldr r5, [r0, #0x8] ; load 4 src pixels + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 3rd 4 pixels + ldr r6, [r2, #0x8] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + ldr r5, [r0, #0xc] ; load 4 src pixels + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + + ; 4th 4 pixels + ldr r6, [r2, #0xc] ; load 4 ref pixels + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + usub8 r8, r5, r6 ; calculate difference + add r0, r0, r1 ; set src_ptr to next row + sel r7, r8, lr ; select bytes with positive difference + usub8 r9, r6, r5 ; calculate difference with reversed operands + add r2, r2, r3 ; set dst_ptr to next row + sel r8, r9, lr ; select bytes with negative difference + + ; calculate partial sums + usad8 r5, r7, lr ; calculate sum of positive differences + usad8 r6, r8, lr ; calculate sum of negative differences + orr r8, r8, r7 ; differences of all 4 pixels + + subs r12, r12, #1 ; next row + + ; calculate sse + uxtb16 r6, r8 ; byte (two pixels) to halfwords + uxtb16 r7, r8, ror #8 ; another two pixels to halfwords + smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) + smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) + + bne loopmse + + ; return stuff + ldr r1, [sp, #28] ; get address of sse + mov r0, r4 ; return sse + str r4, [r1] ; store sse + + pop {r4-r9, pc} + + ENDP + + END diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c new file mode 100644 index 000000000..dbab287e3 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/variance_neon.c @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "./aom_config.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +// w * h must be less than 2048 or local variable v_sum may overflow. +static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, + int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = + vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); + v_sse_hi = + vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum); +} + +void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, unsigned int *sse, int *sum) { + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum); +} + +unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum); + return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8); +} + +unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); + variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride, + 32, 32, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); +} + +unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum1, sum2; + uint32_t sse1, sse2; + + variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); + variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride, + 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), + b_stride, 64, 16, &sse2, &sum2); + sse1 += sse2; + sum1 += sum2; + + variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), + b_stride, 64, 16, &sse2, &sum2); + *sse = sse1 + sse2; + sum1 += sum2; + return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); +} + +unsigned int aom_variance16x8_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 4; i++) { + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16)); + q9s32 = vmlal_s16(q9s32, d26s16, d26s16); + q10s32 = vmlal_s16(q10s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_variance8x16_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride, unsigned int *sse) { + int i; + uint8x8_t d0u8, d2u8, d4u8, d6u8; + int16x4_t d22s16, d23s16, d24s16, d25s16; + uint32x2_t d0u32, d10u32; + int64x1_t d0s64, d1s64; + uint16x8_t q11u16, q12u16; + int32x4_t q8s32, q9s32, q10s32; + int64x2_t q0s64, q1s64, q5s64; + + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + __builtin_prefetch(src_ptr); + + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + __builtin_prefetch(ref_ptr); + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d2u8, d6u8); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16)); + q9s32 = vmlal_s16(q9s32, d22s16, d22s16); + q10s32 = vmlal_s16(q10s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + } + + q10s32 = vaddq_s32(q10s32, q9s32); + q0s64 = vpaddlq_s32(q8s32); + q1s64 = vpaddlq_s32(q10s32); + + d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64)); + d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64)); + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0); + + d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7); + d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32); + + return vget_lane_u32(d0u32, 0); +} + +unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse) { + int i; + int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; + int64x1_t d0s64; + uint8x16_t q0u8, q1u8, q2u8, q3u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + q7s32 = vdupq_n_s32(0); + q8s32 = vdupq_n_s32(0); + q9s32 = vdupq_n_s32(0); + q10s32 = vdupq_n_s32(0); + + for (i = 0; i < 8; i++) { // mse16x16_neon_loop + q0u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q1u8 = vld1q_u8(src_ptr); + src_ptr += source_stride; + q2u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + q3u8 = vld1q_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); + q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); + q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8)); + q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8)); + + d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); + d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); + q7s32 = vmlal_s16(q7s32, d22s16, d22s16); + q8s32 = vmlal_s16(q8s32, d23s16, d23s16); + + d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); + d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); + q9s32 = vmlal_s16(q9s32, d24s16, d24s16); + q10s32 = vmlal_s16(q10s32, d25s16, d25s16); + + d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); + d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); + q7s32 = vmlal_s16(q7s32, d26s16, d26s16); + q8s32 = vmlal_s16(q8s32, d27s16, d27s16); + + d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16)); + d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16)); + q9s32 = vmlal_s16(q9s32, d28s16, d28s16); + q10s32 = vmlal_s16(q10s32, d29s16, d29s16); + } + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q10s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q10s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0); + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} + +unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr, + int source_stride, + const unsigned char *ref_ptr, + int recon_stride) { + int16x4_t d22s16, d24s16, d26s16, d28s16; + int64x1_t d0s64; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + int32x4_t q7s32, q8s32, q9s32, q10s32; + uint16x8_t q11u16, q12u16, q13u16, q14u16; + int64x2_t q1s64; + + d0u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d4u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d1u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d5u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d2u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d6u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + d3u8 = vld1_u8(src_ptr); + src_ptr += source_stride; + d7u8 = vld1_u8(ref_ptr); + ref_ptr += recon_stride; + + q11u16 = vsubl_u8(d0u8, d4u8); + q12u16 = vsubl_u8(d1u8, d5u8); + q13u16 = vsubl_u8(d2u8, d6u8); + q14u16 = vsubl_u8(d3u8, d7u8); + + d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); + d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); + d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); + d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); + + q7s32 = vmull_s16(d22s16, d22s16); + q8s32 = vmull_s16(d24s16, d24s16); + q9s32 = vmull_s16(d26s16, d26s16); + q10s32 = vmull_s16(d28s16, d28s16); + + q7s32 = vaddq_s32(q7s32, q8s32); + q9s32 = vaddq_s32(q9s32, q10s32); + q9s32 = vaddq_s32(q7s32, q9s32); + + q1s64 = vpaddlq_s32(q9s32); + d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); + + return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); +} diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c new file mode 100644 index 000000000..eb6059705 --- /dev/null +++ b/third_party/aom/aom_dsp/avg.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) { + int i, j; + int sum = 0; + for (i = 0; i < 8; ++i, src += stride) + for (j = 0; j < 8; sum += src[j], ++j) { + } + + return ROUND_POWER_OF_TWO(sum, 6); +} + +unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) { + int i, j; + int sum = 0; + for (i = 0; i < 4; ++i, src += stride) + for (j = 0; j < 4; sum += src[j], ++j) { + } + + return ROUND_POWER_OF_TWO(sum, 4); +} + +// src_diff: first pass, 9 bit, dynamic range [-255, 255] +// second pass, 12 bit, dynamic range [-2040, 2040] +static void hadamard_col8(const int16_t *src_diff, int src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. +void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride, + int16_t *coeff) { + int idx; + int16_t buffer[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit + // dynamic range [-255, 255] + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + coeff += 8; // coeff: 15 bit + // dynamic range [-16320, 16320] + ++tmp_buf; + } +} + +// In place 16x16 2D Hadamard transform +void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride, + int16_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 64; ++idx) { + int16_t a0 = coeff[0]; + int16_t a1 = coeff[64]; + int16_t a2 = coeff[128]; + int16_t a3 = coeff[192]; + + int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + int16_t b3 = (a2 - a3) >> 1; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +// coeff: 16 bits, dynamic range [-32640, 32640]. +// length: value range {16, 64, 256, 1024}. +int aom_satd_c(const int16_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] + return satd; +} + +// Integer projection onto row vectors. +// height: value range {16, 32, 64}. +void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int ref_stride, + int height) { + int idx; + const int norm_factor = height >> 1; + for (idx = 0; idx < 16; ++idx) { + int i; + hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 16320]. + for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 510]. + hbuf[idx] /= norm_factor; + ++ref; + } +} + +// width: value range {16, 32, 64}. +int16_t aom_int_pro_col_c(const uint8_t *ref, int width) { + int idx; + int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 16320] + for (idx = 0; idx < width; ++idx) sum += ref[idx]; + return sum; +} + +// ref: [0 - 510] +// src: [0 - 510] +// bwl: {2, 3, 4} +int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) { + int i; + int width = 4 << bwl; + int sse = 0, mean = 0, var; + + for (i = 0; i < width; ++i) { + int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. + mean += diff; // mean: dynamic range 16 bits. + sse += diff * diff; // sse: dynamic range 26 bits. + } + + // (mean * mean): dynamic range 31 bits. + var = sse - ((mean * mean) >> (bwl + 2)); + return var; +} + +void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int *min, int *max) { + int i, j; + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) { + for (j = 0; j < 8; ++j) { + int diff = abs(src[j] - ref[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ? diff : *max; + } + } +} + +#if CONFIG_HIGHBITDEPTH +unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(src); + for (i = 0; i < 8; ++i, s += stride) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return ROUND_POWER_OF_TWO(sum, 6); +} + +unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) { + int i, j; + int sum = 0; + const uint16_t *s = CONVERT_TO_SHORTPTR(src); + for (i = 0; i < 4; ++i, s += stride) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return ROUND_POWER_OF_TWO(sum, 4); +} + +void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + int i, j; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const uint16_t *d = CONVERT_TO_SHORTPTR(d8); + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ? diff : *max; + } + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c new file mode 100644 index 000000000..96c4cb436 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/bitreader.h" + +#include "av1/common/common.h" + +// Inverse recenters a non-negative literal v around a reference r +static uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) { + if (v > (r << 1)) + return v; + else if ((v & 1) == 0) + return (v >> 1) + r; + else + return r - ((v + 1) >> 1); +} + +// Inverse recenters a non-negative literal v in [0, n-1] around a +// reference r also in [0, n-1] +static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { + if ((r << 1) <= n) { + return inv_recenter_nonneg(r, v); + } else { + return n - 1 - inv_recenter_nonneg(n - 1 - r, v); + } +} + +int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits) { + if (aom_read_bit(r, NULL)) { + int s = aom_read_bit(r, NULL); + int16_t x = aom_read_literal(r, mag_bits, NULL) + 1; + return (s > 0 ? -x : x); + } else { + return 0; + } +} + +uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n) { + if (n <= 1) return 0; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + const int v = aom_read_literal(r, l - 1, NULL); + return v < m ? v : (v << 1) - m + aom_read_bit(r, NULL); +} + +uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p, + uint16_t ref) { + if (n <= 1) return 0; + assert(p > 0 && p <= n); + assert(ref < n); + int lolimit = ref - p / 2; + const int hilimit = lolimit + p - 1; + if (lolimit < 0) { + lolimit = 0; + } else if (hilimit >= n) { + lolimit = n - p; + } + int v; + if (aom_read_bit(r, NULL)) { + v = aom_read_primitive_quniform(r, p) + lolimit; + } else { + v = aom_read_primitive_quniform(r, n - p); + if (v >= lolimit) v += p; + } + return v; +} + +// Decode finite subexponential code that for a symbol v in [0, n-1] with +// parameter k +uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k) { + int i = 0; + int mk = 0; + uint16_t v; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + v = aom_read_primitive_quniform(r, n - mk) + mk; + break; + } else { + if (aom_read_bit(r, NULL)) { + i = i + 1; + mk += a; + } else { + v = aom_read_literal(r, b, NULL) + mk; + break; + } + } + } + return v; +} + +// Decode finite subexponential code that for a symbol v in [0, n-1] with +// parameter k +// based on a reference ref also in [0, n-1]. +uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref) { + return inv_recenter_finite_nonneg(n, ref, + aom_read_primitive_subexpfin(r, n, k)); +} + +// Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with +// parameter k based on a reference ref also in [-(n-1), n-1]. +int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n, + uint16_t k, int16_t ref) { + ref += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref) - n + 1; +} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h new file mode 100644 index 000000000..738d91da8 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BINARY_CODES_READER_H_ +#define AOM_DSP_BINARY_CODES_READER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/bitreader.h" + +int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits); + +uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n); +uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p, + uint16_t ref); +uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k); +uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref); +int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n, + uint16_t k, int16_t ref); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BINARY_CODES_READER_H_ diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c new file mode 100644 index 000000000..91e807b29 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_writer.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/bitwriter.h" + +#include "av1/common/common.h" + +// Recenters a non-negative literal v around a reference r +static uint16_t recenter_nonneg(uint16_t r, uint16_t v) { + if (v > (r << 1)) + return v; + else if (v >= r) + return ((v - r) << 1); + else + return ((r - v) << 1) - 1; +} + +// Recenters a non-negative literal v in [0, n-1] around a +// reference r also in [0, n-1] +static uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { + if ((r << 1) <= n) { + return recenter_nonneg(r, v); + } else { + return recenter_nonneg(n - 1 - r, n - 1 - v); + } +} + +// Codes a symbol v in [-2^mag_bits, 2^mag_bits]. +// mag_bits is number of bits for magnitude. The alphabet is of size +// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to +// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide +// and 1 more bit for the sign if non-zero. +void aom_write_primitive_symmetric(aom_writer *w, int16_t v, + unsigned int abs_bits) { + if (v == 0) { + aom_write_bit(w, 0); + } else { + const int x = abs(v); + const int s = v < 0; + aom_write_bit(w, 1); + aom_write_bit(w, s); + aom_write_literal(w, x - 1, abs_bits); + } +} + +int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) { + return (v == 0 ? 1 : abs_bits + 2); +} + +// Encodes a value v in [0, n-1] quasi-uniformly +void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { + if (n <= 1) return; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + if (v < m) { + aom_write_literal(w, v, l - 1); + } else { + aom_write_literal(w, m + ((v - m) >> 1), l - 1); + aom_write_bit(w, (v - m) & 1); + } +} + +int aom_count_primitive_quniform(uint16_t n, uint16_t v) { + if (n <= 1) return 0; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + return v < m ? l - 1 : l; +} + +// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1] +// The closest p values of v from ref are coded using a p-ary quasi-unoform +// short code while the remaining n-p values are coded with a longer code. +void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p, + uint16_t ref, uint16_t v) { + if (n <= 1) return; + assert(p > 0 && p <= n); + assert(ref < n); + int lolimit = ref - p / 2; + int hilimit = lolimit + p - 1; + if (lolimit < 0) { + lolimit = 0; + hilimit = p - 1; + } else if (hilimit >= n) { + hilimit = n - 1; + lolimit = n - p; + } + if (v >= lolimit && v <= hilimit) { + aom_write_bit(w, 1); + v = v - lolimit; + aom_write_primitive_quniform(w, p, v); + } else { + aom_write_bit(w, 0); + if (v > hilimit) v -= p; + aom_write_primitive_quniform(w, n - p, v); + } +} + +int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref, + uint16_t v) { + if (n <= 1) return 0; + assert(p > 0 && p <= n); + assert(ref < n); + int lolimit = ref - p / 2; + int hilimit = lolimit + p - 1; + if (lolimit < 0) { + lolimit = 0; + hilimit = p - 1; + } else if (hilimit >= n) { + hilimit = n - 1; + lolimit = n - p; + } + int count = 0; + if (v >= lolimit && v <= hilimit) { + count++; + v = v - lolimit; + count += aom_count_primitive_quniform(p, v); + } else { + count++; + if (v > hilimit) v -= p; + count += aom_count_primitive_quniform(n - p, v); + } + return count; +} + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t v) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + aom_write_primitive_quniform(w, n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + aom_write_bit(w, t); + if (t) { + i = i + 1; + mk += a; + } else { + aom_write_literal(w, v - mk, b); + break; + } + } + } +} + +int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { + int count = 0; + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + count += aom_count_primitive_quniform(n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + count++; + if (t) { + i = i + 1; + mk += a; + } else { + count += b; + break; + } + } + } + return count; +} + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +// based on a reference ref also in [0, n-1]. +// Recenters symbol around r first and then uses a finite subexponential code. +void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, + int16_t ref, int16_t v) { + aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v)); +} + +void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, + uint16_t k, uint16_t ref, + uint16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v); +} + +int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, + uint16_t v) { + return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); +} + +int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, + int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v); +} diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h new file mode 100644 index 000000000..ab5ccbf15 --- /dev/null +++ b/third_party/aom/aom_dsp/binary_codes_writer.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BINARY_CODES_WRITER_H_ +#define AOM_DSP_BINARY_CODES_WRITER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/bitwriter.h" + +// Codes a symbol v in [-2^mag_bits, 2^mag_bits] +// mag_bits is number of bits for magnitude. The alphabet is of size +// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to +// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide +// and 1 more bit for the sign if non-zero. +void aom_write_primitive_symmetric(aom_writer *w, int16_t v, + unsigned int mag_bits); + +// Encodes a value v in [0, n-1] quasi-uniformly +void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v); + +// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1] +// The closest p values of v from ref are coded using a p-ary quasi-unoform +// short code while the remaining n-p values are coded with a longer code. +void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p, + uint16_t ref, uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t v); + +// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k +// based on a reference ref also in [0, n-1]. +void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, + uint16_t ref, uint16_t v); + +// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with +// parameter k based on a reference ref also in [-(n-1), n-1]. +void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, + uint16_t k, int16_t ref, + int16_t v); + +// Functions that counts bits for the above primitives +int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); +int aom_count_primitive_quniform(uint16_t n, uint16_t v); +int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref, + uint16_t v); +int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v); +int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, + uint16_t v); +int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, + int16_t v); +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BINARY_CODES_WRITER_H_ diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h new file mode 100644 index 000000000..9cd34dd48 --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader.h @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BITREADER_H_ +#define AOM_DSP_BITREADER_H_ + +#include +#include + +#include "./aom_config.h" +#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL +#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL." +#endif + +#include "aom/aomdx.h" +#include "aom/aom_integer.h" +#if CONFIG_ANS +#include "aom_dsp/ansreader.h" +#elif CONFIG_DAALA_EC +#include "aom_dsp/daalaboolreader.h" +#else +#include "aom_dsp/dkboolreader.h" +#endif +#include "aom_dsp/prob.h" +#include "av1/common/odintrin.h" + +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#define ACCT_STR_NAME acct_str +#define ACCT_STR_PARAM , const char *ACCT_STR_NAME +#define ACCT_STR_ARG(s) , s +#else +#define ACCT_STR_PARAM +#define ACCT_STR_ARG(s) +#endif + +#define aom_read(r, prob, ACCT_STR_NAME) \ + aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_bit(r, ACCT_STR_NAME) \ + aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \ + aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_literal(r, bits, ACCT_STR_NAME) \ + aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_ANS +typedef struct AnsDecoder aom_reader; +#elif CONFIG_DAALA_EC +typedef struct daala_reader aom_reader; +#else +typedef struct aom_dk_reader aom_reader; +#endif + +static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer, + size_t size, aom_decrypt_cb decrypt_cb, + void *decrypt_state) { +#if CONFIG_ANS + (void)decrypt_cb; + (void)decrypt_state; + if (size > INT_MAX) return 1; + return ans_read_init(r, buffer, (int)size); +#elif CONFIG_DAALA_EC + (void)decrypt_cb; + (void)decrypt_state; + return aom_daala_reader_init(r, buffer, (int)size); +#else + return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state); +#endif +} + +static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) { +#if CONFIG_ANS + (void)r; + assert(0 && "Use the raw buffer size with ANS"); + return NULL; +#elif CONFIG_DAALA_EC + return aom_daala_reader_find_end(r); +#else + return aom_dk_reader_find_end(r); +#endif +} + +static INLINE int aom_reader_has_error(aom_reader *r) { +#if CONFIG_ANS + return ans_reader_has_error(r); +#elif CONFIG_DAALA_EC + return aom_daala_reader_has_error(r); +#else + return aom_dk_reader_has_error(r); +#endif +} + +// Returns the position in the bit reader in bits. +static INLINE uint32_t aom_reader_tell(const aom_reader *r) { +#if CONFIG_ANS + (void)r; + assert(0 && "aom_reader_tell() is unimplemented for ANS"); + return 0; +#elif CONFIG_DAALA_EC + return aom_daala_reader_tell(r); +#else + return aom_dk_reader_tell(r); +#endif +} + +// Returns the position in the bit reader in 1/8th bits. +static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) { +#if CONFIG_ANS + (void)r; + assert(0 && "aom_reader_tell_frac() is unimplemented for ANS"); + return 0; +#elif CONFIG_DAALA_EC + return aom_daala_reader_tell_frac(r); +#else + return aom_dk_reader_tell_frac(r); +#endif +} + +#if CONFIG_ACCOUNTING +static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) { + if (r->accounting != NULL) { + uint32_t tell_frac; + tell_frac = aom_reader_tell_frac(r); + aom_accounting_record(r->accounting, ACCT_STR_NAME, + tell_frac - r->accounting->last_tell_frac); + r->accounting->last_tell_frac = tell_frac; + } +} + +static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { + if (r->accounting != NULL) { + r->accounting->syms.num_multi_syms += !is_binary; + r->accounting->syms.num_binary_syms += !!is_binary; + } +} +#endif + +static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { + int ret; +#if CONFIG_ANS + ret = rabs_read(r, prob); +#elif CONFIG_DAALA_EC + ret = aom_daala_read(r, prob); +#else + ret = aom_dk_read(r, prob); +#endif +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, 1); +#endif + return ret; +} + +static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { + int ret; +#if CONFIG_ANS + ret = rabs_read_bit(r); // Non trivial optimization at half probability +#elif CONFIG_DAALA_EC && CONFIG_RAWBITS + // Note this uses raw bits and is not the same as aom_daala_read(r, 128); + // Calls to this function are omitted from raw symbol accounting. + ret = aom_daala_read_bit(r); +#else + ret = aom_read(r, 128, NULL); // aom_prob_half +#endif +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return ret; +} + +static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { + int literal = 0, bit; + + for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit; +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return literal; +} + +static INLINE int aom_read_tree_as_bits(aom_reader *r, + const aom_tree_index *tree, + const aom_prob *probs) { + aom_tree_index i = 0; + + while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue; + return -i; +} + +#if CONFIG_EC_MULTISYMBOL +static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int ret; +#if CONFIG_ANS + (void)nsymbs; + ret = rans_read(r, cdf); +#elif CONFIG_DAALA_EC + ret = daala_read_symbol(r, cdf, nsymbs); +#else +#error \ + "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \ + "coder. Enable daala_ec or ans for a valid configuration." +#endif + +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); + aom_update_symb_counts(r, (nsymbs == 2)); +#endif + return ret; +} + +static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int ret; + ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); +#if CONFIG_EC_ADAPT + update_cdf(cdf, ret, nsymbs); +#endif + return ret; +} + +static INLINE int aom_read_tree_as_cdf(aom_reader *r, + const aom_tree_index *tree, + const aom_prob *probs) { + aom_tree_index i = 0; + do { + aom_cdf_prob cdf[16]; + aom_tree_index index[16]; + int path[16]; + int dist[16]; + int nsymbs; + int symb; + nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist); + symb = aom_read_cdf(r, cdf, nsymbs, NULL); + OD_ASSERT(symb >= 0 && symb < nsymbs); + i = index[symb]; + } while (i > 0); + return -i; +} +#endif // CONFIG_EC_MULTISYMBOL + +static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree, + const aom_prob *probs ACCT_STR_PARAM) { + int ret; +#if CONFIG_EC_MULTISYMBOL + ret = aom_read_tree_as_cdf(r, tree, probs); +#else + ret = aom_read_tree_as_bits(r, tree, probs); +#endif +#if CONFIG_ACCOUNTING + if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); +#endif + return ret; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BITREADER_H_ diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c new file mode 100644 index 000000000..009682b4c --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include "./aom_config.h" +#include "./bitreader_buffer.h" + +size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) { + return (rb->bit_offset + 7) >> 3; +} + +int aom_rb_read_bit(struct aom_read_bit_buffer *rb) { + const uint32_t off = rb->bit_offset; + const uint32_t p = off >> 3; + const int q = 7 - (int)(off & 0x7); + if (rb->bit_buffer + p < rb->bit_buffer_end) { + const int bit = (rb->bit_buffer[p] >> q) & 1; + rb->bit_offset = off + 1; + return bit; + } else { + rb->error_handler(rb->error_handler_data); + return 0; + } +} + +int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { + int value = 0, bit; + for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit; + return value; +} + +int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) { + const int value = aom_rb_read_literal(rb, bits); + return aom_rb_read_bit(rb) ? -value : value; +} + +int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { + const int nbits = sizeof(unsigned) * 8 - bits - 1; + const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits; + return ((int)value) >> nbits; +} diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h new file mode 100644 index 000000000..22187357e --- /dev/null +++ b/third_party/aom/aom_dsp/bitreader_buffer.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BITREADER_BUFFER_H_ +#define AOM_DSP_BITREADER_BUFFER_H_ + +#include + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*aom_rb_error_handler)(void *data); + +struct aom_read_bit_buffer { + const uint8_t *bit_buffer; + const uint8_t *bit_buffer_end; + uint32_t bit_offset; + + void *error_handler_data; + aom_rb_error_handler error_handler; +}; + +size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb); + +int aom_rb_read_bit(struct aom_read_bit_buffer *rb); + +int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits); + +int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits); + +int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BITREADER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h new file mode 100644 index 000000000..6e3fac260 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BITWRITER_H_ +#define AOM_DSP_BITWRITER_H_ + +#include +#include "./aom_config.h" +#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL +#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL" +#endif + +#if CONFIG_ANS +#include "aom_dsp/buf_ans.h" +#elif CONFIG_DAALA_EC +#include "aom_dsp/daalaboolwriter.h" +#else +#include "aom_dsp/dkboolwriter.h" +#endif +#include "aom_dsp/prob.h" + +#if CONFIG_RD_DEBUG +#include "av1/common/blockd.h" +#include "av1/encoder/cost.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_ANS +typedef struct BufAnsCoder aom_writer; +#elif CONFIG_DAALA_EC +typedef struct daala_writer aom_writer; +#else +typedef struct aom_dk_writer aom_writer; +#endif + +typedef struct TOKEN_STATS { + int cost; +#if CONFIG_VAR_TX +#if CONFIG_RD_DEBUG + int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE]; +#endif +#endif +} TOKEN_STATS; + +static INLINE void init_token_stats(TOKEN_STATS *token_stats) { +#if CONFIG_VAR_TX +#if CONFIG_RD_DEBUG + int r, c; + for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { + for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) { + token_stats->txb_coeff_cost_map[r][c] = 0; + } + } +#endif +#endif + token_stats->cost = 0; +} + +static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { +#if CONFIG_ANS + (void)bc; + (void)buffer; + assert(0 && "buf_ans requires a more complicated startup procedure"); +#elif CONFIG_DAALA_EC + aom_daala_start_encode(bc, buffer); +#else + aom_dk_start_encode(bc, buffer); +#endif +} + +static INLINE void aom_stop_encode(aom_writer *bc) { +#if CONFIG_ANS + (void)bc; + assert(0 && "buf_ans requires a more complicated shutdown procedure"); +#elif CONFIG_DAALA_EC + aom_daala_stop_encode(bc); +#else + aom_dk_stop_encode(bc); +#endif +} + +static INLINE void aom_write(aom_writer *br, int bit, int probability) { +#if CONFIG_ANS + buf_rabs_write(br, bit, probability); +#elif CONFIG_DAALA_EC + aom_daala_write(br, bit, probability); +#else + aom_dk_write(br, bit, probability); +#endif +} + +static INLINE void aom_write_record(aom_writer *br, int bit, int probability, + TOKEN_STATS *token_stats) { + aom_write(br, bit, probability); +#if CONFIG_RD_DEBUG + token_stats->cost += av1_cost_bit(probability, bit); +#else + (void)token_stats; +#endif +} + +static INLINE void aom_write_bit(aom_writer *w, int bit) { +#if CONFIG_ANS + buf_rabs_write_bit(w, bit); +#elif CONFIG_DAALA_EC && CONFIG_RAWBITS + // Note this uses raw bits and is not the same as aom_daala_write(r, 128); + aom_daala_write_bit(w, bit); +#else + aom_write(w, bit, 128); // aom_prob_half +#endif +} + +static INLINE void aom_write_bit_record(aom_writer *w, int bit, + TOKEN_STATS *token_stats) { + aom_write_bit(w, bit); +#if CONFIG_RD_DEBUG + token_stats->cost += av1_cost_bit(128, bit); // aom_prob_half +#else + (void)token_stats; +#endif +} + +static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { + int bit; + + for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit)); +} + +static INLINE void aom_write_tree_as_bits(aom_writer *w, + const aom_tree_index *tr, + const aom_prob *probs, int bits, + int len, aom_tree_index i) { + do { + const int bit = (bits >> --len) & 1; + aom_write(w, bit, probs[i >> 1]); + i = tr[i + bit]; + } while (len); +} + +static INLINE void aom_write_tree_as_bits_record( + aom_writer *w, const aom_tree_index *tr, const aom_prob *probs, int bits, + int len, aom_tree_index i, TOKEN_STATS *token_stats) { + do { + const int bit = (bits >> --len) & 1; + aom_write_record(w, bit, probs[i >> 1], token_stats); + i = tr[i + bit]; + } while (len); +} + +#if CONFIG_EC_MULTISYMBOL +static INLINE void aom_write_cdf(aom_writer *w, int symb, + const aom_cdf_prob *cdf, int nsymbs) { +#if CONFIG_ANS + (void)nsymbs; + assert(cdf); + const aom_cdf_prob cum_prob = symb > 0 ? cdf[symb - 1] : 0; + const aom_cdf_prob prob = cdf[symb] - cum_prob; + buf_rans_write(w, cum_prob, prob); +#elif CONFIG_DAALA_EC + daala_write_symbol(w, symb, cdf, nsymbs); +#else +#error \ + "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \ + "coder. Enable daala_ec or ans for a valid configuration." +#endif +} + +static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, + int nsymbs) { + aom_write_cdf(w, symb, cdf, nsymbs); +#if CONFIG_EC_ADAPT + update_cdf(cdf, symb, nsymbs); +#endif +} + +static INLINE void aom_write_tree_as_cdf(aom_writer *w, + const aom_tree_index *tree, + const aom_prob *probs, int bits, + int len, aom_tree_index i) { + aom_tree_index root; + root = i; + do { + aom_cdf_prob cdf[16]; + aom_tree_index index[16]; + int path[16]; + int dist[16]; + int nsymbs; + int symb; + int j; + /* Compute the CDF of the binary tree using the given probabilities. */ + nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist); + /* Find the symbol to code. */ + symb = -1; + for (j = 0; j < nsymbs; j++) { + /* If this symbol codes a leaf node, */ + if (index[j] <= 0) { + if (len == dist[j] && path[j] == bits) { + symb = j; + break; + } + } else { + if (len > dist[j] && path[j] == bits >> (len - dist[j])) { + symb = j; + break; + } + } + } + OD_ASSERT(symb != -1); + aom_write_cdf(w, symb, cdf, nsymbs); + bits &= (1 << (len - dist[symb])) - 1; + len -= dist[symb]; + } while (len); +} + +#endif // CONFIG_EC_MULTISYMBOL + +static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree, + const aom_prob *probs, int bits, int len, + aom_tree_index i) { +#if CONFIG_EC_MULTISYMBOL + aom_write_tree_as_cdf(w, tree, probs, bits, len, i); +#else + aom_write_tree_as_bits(w, tree, probs, bits, len, i); +#endif +} + +static INLINE void aom_write_tree_record(aom_writer *w, + const aom_tree_index *tree, + const aom_prob *probs, int bits, + int len, aom_tree_index i, + TOKEN_STATS *token_stats) { +#if CONFIG_EC_MULTISYMBOL + (void)token_stats; + aom_write_tree_as_cdf(w, tree, probs, bits, len, i); +#else + aom_write_tree_as_bits_record(w, tree, probs, bits, len, i, token_stats); +#endif +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BITWRITER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c new file mode 100644 index 000000000..1b3dd2913 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./bitwriter_buffer.h" + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) { + return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); +} + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) { + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + if (q == CHAR_BIT - 1) { + // Zero next char and write bit + wb->bit_buffer[p] = bit << q; + } else { + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + } + wb->bit_offset = off + 1; +} + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) { + // Do not zero bytes but overwrite exisiting values + const int off = (int)wb->bit_offset; + const int p = off / CHAR_BIT; + const int q = CHAR_BIT - 1 - off % CHAR_BIT; + wb->bit_buffer[p] &= ~(1 << q); + wb->bit_buffer[p] |= bit << q; + wb->bit_offset = off + 1; +} + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) { + int bit; + for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); +} + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits) { + int bit; + for (bit = bits - 1; bit >= 0; bit--) + aom_wb_overwrite_bit(wb, (data >> bit) & 1); +} + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits) { + aom_wb_write_literal(wb, data, bits + 1); +} diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h new file mode 100644 index 000000000..1f23dc857 --- /dev/null +++ b/third_party/aom/aom_dsp/bitwriter_buffer.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BITWRITER_BUFFER_H_ +#define AOM_DSP_BITWRITER_BUFFER_H_ + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct aom_write_bit_buffer { + uint8_t *bit_buffer; + uint32_t bit_offset; +}; + +uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); + +void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit); + +void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); + +void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, + int bits); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h new file mode 100644 index 000000000..e5297ff83 --- /dev/null +++ b/third_party/aom/aom_dsp/blend.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BLEND_H_ +#define AOM_DSP_BLEND_H_ + +#include "aom_ports/mem.h" + +// Various blending functions and macros. +// See also the aom_blend_* functions in aom_dsp_rtcd.h + +// Alpha blending with alpha values from the range [0, 64], where 64 +// means use the first input and 0 means use the second input. + +#define AOM_BLEND_A64_ROUND_BITS 6 +#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64 + +#define AOM_BLEND_A64(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A64_ROUND_BITS) + +// Alpha blending with alpha values from the range [0, 256], where 256 +// means use the first input and 0 means use the second input. +#define AOM_BLEND_A256_ROUND_BITS 8 +#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256 + +#define AOM_BLEND_A256(a, v0, v1) \ + ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \ + AOM_BLEND_A256_ROUND_BITS) + +// Blending by averaging. +#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) + +#endif // AOM_DSP_BLEND_H_ diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c new file mode 100644 index 000000000..99b4b8a59 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_hmask.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "./aom_dsp_rtcd.h" + +void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64( + mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]); + } + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c new file mode 100644 index 000000000..3e15542c9 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_mask.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/aom_dsp_common.h" + +#include "./aom_dsp_rtcd.h" + +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. + +void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int subh, int subw) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w, int subh, int subw, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c new file mode 100644 index 000000000..1a5e30e31 --- /dev/null +++ b/third_party/aom/aom_dsp/blend_a64_vmask.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "./aom_dsp_rtcd.h" + +void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, uint32_t src0_stride, + const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + int i, j; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8); + (void)bd; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + for (i = 0; i < h; ++i) { + const int m = mask[i]; + for (j = 0; j < w; ++j) { + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c new file mode 100644 index 000000000..8fe1ff763 --- /dev/null +++ b/third_party/aom/aom_dsp/buf_ans.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/buf_ans.h" +#include "aom_mem/aom_mem.h" +#include "aom/internal/aom_codec_internal.h" + +void aom_buf_ans_alloc(struct BufAnsCoder *c, + struct aom_internal_error_info *error, int size) { + c->error = error; + c->size = size; + assert(c->size > 1); + AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf))); + // Initialize to overfull to trigger the assert in write. + c->offset = c->size + 1; +} + +void aom_buf_ans_free(struct BufAnsCoder *c) { + aom_free(c->buf); + c->buf = NULL; + c->size = 0; +} + +#if !ANS_MAX_SYMBOLS +void aom_buf_ans_grow(struct BufAnsCoder *c) { + struct buffered_ans_symbol *new_buf = NULL; + int new_size = c->size * 2; + AOM_CHECK_MEM_ERROR(c->error, new_buf, + aom_malloc(new_size * sizeof(*new_buf))); + memcpy(new_buf, c->buf, c->size * sizeof(*c->buf)); + aom_free(c->buf); + c->buf = new_buf; + c->size = new_size; +} +#endif + +void aom_buf_ans_flush(struct BufAnsCoder *const c) { + int offset; +#if ANS_MAX_SYMBOLS + if (c->offset == 0) return; +#endif + assert(c->offset > 0); + offset = c->offset - 1; + // Code the first symbol such that it brings the state to the smallest normal + // state from an initial state that would have been a subnormal/refill state. + if (c->buf[offset].method == ANS_METHOD_RANS) { + c->ans.state += c->buf[offset].val_start; + } else { + c->ans.state += c->buf[offset].val_start ? c->buf[offset].prob : 0; + } + for (offset = offset - 1; offset >= 0; --offset) { + if (c->buf[offset].method == ANS_METHOD_RANS) { + rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob); + } else { + rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start, + (AnsP8)c->buf[offset].prob); + } + } + c->offset = 0; + c->output_bytes += ans_write_end(&c->ans); +} diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h new file mode 100644 index 000000000..0768506b3 --- /dev/null +++ b/third_party/aom/aom_dsp/buf_ans.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_BUF_ANS_H_ +#define AOM_DSP_BUF_ANS_H_ +// Buffered forward ANS writer. +// Symbols are written to the writer in forward (decode) order and serialized +// backwards due to ANS's stack like behavior. + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/ans.h" +#include "aom_dsp/answriter.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#define ANS_METHOD_RABS 0 +#define ANS_METHOD_RANS 1 + +struct buffered_ans_symbol { + unsigned int method : 1; // one of ANS_METHOD_RABS or ANS_METHOD_RANS + // TODO(aconverse): Should be possible to write this in terms of start for ABS + unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS + // start in symbol cycle for Rans + unsigned int prob : RANS_PROB_BITS; // Probability of this symbol +}; + +struct BufAnsCoder { + struct aom_internal_error_info *error; + struct buffered_ans_symbol *buf; + struct AnsCoder ans; + int size; + int offset; + int output_bytes; +#if ANS_MAX_SYMBOLS + int window_size; +#endif +}; + +// Allocate a buffered ANS coder to store size symbols. +// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS +// partition. +// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the +// buffer will grow on demand +void aom_buf_ans_alloc(struct BufAnsCoder *c, + struct aom_internal_error_info *error, int hint); + +void aom_buf_ans_free(struct BufAnsCoder *c); + +#if !ANS_MAX_SYMBOLS +void aom_buf_ans_grow(struct BufAnsCoder *c); +#endif + +void aom_buf_ans_flush(struct BufAnsCoder *const c); + +static INLINE void buf_ans_write_init(struct BufAnsCoder *const c, + uint8_t *const output_buffer) { + c->offset = 0; + c->output_bytes = 0; + ans_write_init(&c->ans, output_buffer); +} + +static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val, + AnsP8 prob) { + assert(c->offset <= c->size); +#if !ANS_MAX_SYMBOLS + if (c->offset == c->size) { + aom_buf_ans_grow(c); + } +#endif + c->buf[c->offset].method = ANS_METHOD_RABS; + c->buf[c->offset].val_start = val; + c->buf[c->offset].prob = prob; + ++c->offset; +#if ANS_MAX_SYMBOLS + if (c->offset == c->size) aom_buf_ans_flush(c); +#endif +} + +// Buffer one symbol for encoding using rANS. +// cum_prob: The cumulative probability before this symbol (the offset of +// the symbol in the symbol cycle) +// prob: The probability of this symbol (l_s from the paper) +// RANS_PRECISION takes the place of m from the paper. +static INLINE void buf_rans_write(struct BufAnsCoder *const c, + aom_cdf_prob cum_prob, aom_cdf_prob prob) { + assert(c->offset <= c->size); +#if !ANS_MAX_SYMBOLS + if (c->offset == c->size) { + aom_buf_ans_grow(c); + } +#endif + c->buf[c->offset].method = ANS_METHOD_RANS; + c->buf[c->offset].val_start = cum_prob; + c->buf[c->offset].prob = prob; + ++c->offset; +#if ANS_MAX_SYMBOLS + if (c->offset == c->size) aom_buf_ans_flush(c); +#endif +} + +static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) { + buf_rabs_write(c, bit, 128); +} + +static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal, + int bits) { + int bit; + + assert(bits < 31); + for (bit = bits - 1; bit >= 0; bit--) + buf_rabs_write_bit(c, 1 & (literal >> bit)); +} + +static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) { + assert(c->offset == 0); + return c->output_bytes; +} +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_BUF_ANS_H_ diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c new file mode 100644 index 000000000..0fc7b14a5 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolreader.c @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/daalaboolreader.h" + +int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) { + if (size && !buffer) { + return 1; + } + r->buffer_end = buffer + size; + r->buffer = buffer; + od_ec_dec_init(&r->ec, buffer, size - 1); +#if CONFIG_ACCOUNTING + r->accounting = NULL; +#endif + return 0; +} + +const uint8_t *aom_daala_reader_find_end(daala_reader *r) { + return r->buffer_end; +} + +uint32_t aom_daala_reader_tell(const daala_reader *r) { + return od_ec_dec_tell(&r->ec); +} + +uint32_t aom_daala_reader_tell_frac(const daala_reader *r) { + return od_ec_dec_tell_frac(&r->ec); +} diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h new file mode 100644 index 000000000..428d74db0 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolreader.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_DAALABOOLREADER_H_ +#define AOM_DSP_DAALABOOLREADER_H_ + +#include "aom/aom_integer.h" +#include "aom_dsp/entdec.h" +#include "aom_dsp/prob.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif +#if CONFIG_BITSTREAM_DEBUG +#include +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#ifdef __cplusplus +extern "C" { +#endif + +struct daala_reader { + const uint8_t *buffer; + const uint8_t *buffer_end; + od_ec_dec ec; +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif +}; + +typedef struct daala_reader daala_reader; + +int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size); +const uint8_t *aom_daala_reader_find_end(daala_reader *r); +uint32_t aom_daala_reader_tell(const daala_reader *r); +uint32_t aom_daala_reader_tell_frac(const daala_reader *r); + +static INLINE int aom_daala_read(daala_reader *r, int prob) { + int bit; +#if CONFIG_EC_SMALLMUL + int p = (0x7FFFFF - (prob << 15) + prob) >> 8; +#else + int p = ((prob << 15) + 256 - prob) >> 8; +#endif +#if CONFIG_BITSTREAM_DEBUG +/*{ + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + if (frame_idx == 0 && queue_r == 0) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n", + frame_idx, queue_r); + } +}*/ +#endif + + bit = od_ec_decode_bool_q15(&r->ec, p); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int ref_bit, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs); + if (ref_nsymbs != 2) { + fprintf(stderr, + "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs " + "%d queue_r %d\n", + frame_idx, 2, ref_nsymbs, queue_r); + assert(0); + } + if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) || + (ref_cdf[1] != 32767)) { + fprintf(stderr, + "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d", + frame_idx, p, 32767, ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (bit != ref_bit) { + fprintf(stderr, + "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d " + "queue_r %d\n", + frame_idx, bit, ref_bit, queue_r); + assert(0); + } + } +#endif + + return bit; +} + +#if CONFIG_RAWBITS +static INLINE int aom_daala_read_bit(daala_reader *r) { + return od_ec_dec_bits(&r->ec, 1, "aom_bits"); +} +#endif + +static INLINE int aom_daala_reader_has_error(daala_reader *r) { + return r->ec.error; +} + +static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf, + int nsymbs) { + int symb; + symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); + +#if CONFIG_BITSTREAM_DEBUG + { + int i; + int cdf_error = 0; + int ref_symb, ref_nsymbs; + aom_cdf_prob ref_cdf[16]; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs); + if (nsymbs != ref_nsymbs) { + fprintf(stderr, + "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d " + "queue_r %d\n", + frame_idx, nsymbs, ref_nsymbs, queue_r); + cdf_error = 0; + assert(0); + } else { + for (i = 0; i < nsymbs; ++i) + if (cdf[i] != ref_cdf[i]) cdf_error = 1; + } + if (cdf_error) { + fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx, + cdf[0]); + for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]); + fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]); + for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]); + fprintf(stderr, "} queue_r %d\n", queue_r); + assert(0); + } + if (symb != ref_symb) { + fprintf( + stderr, + "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n", + frame_idx, symb, ref_symb, queue_r); + assert(0); + } + } +#endif + + return symb; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c new file mode 100644 index 000000000..0ba8f6ab8 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolwriter.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "aom_dsp/daalaboolwriter.h" + +void aom_daala_start_encode(daala_writer *br, uint8_t *source) { + br->buffer = source; + br->pos = 0; + od_ec_enc_init(&br->ec, 62025); +} + +void aom_daala_stop_encode(daala_writer *br) { + uint32_t daala_bytes; + unsigned char *daala_data; + daala_data = od_ec_enc_done(&br->ec, &daala_bytes); + memcpy(br->buffer, daala_data, daala_bytes); + br->pos = daala_bytes; + /* Prevent ec bitstream from being detected as a superframe marker. + Must always be added, so that rawbits knows the exact length of the + bitstream. */ + br->buffer[br->pos++] = 0; + od_ec_enc_clear(&br->ec); +} diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h new file mode 100644 index 000000000..bbaf53c69 --- /dev/null +++ b/third_party/aom/aom_dsp/daalaboolwriter.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_DAALABOOLWRITER_H_ +#define AOM_DSP_DAALABOOLWRITER_H_ + +#include + +#include "aom_dsp/entenc.h" +#include "aom_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#ifdef __cplusplus +extern "C" { +#endif + +struct daala_writer { + unsigned int pos; + uint8_t *buffer; + od_ec_enc ec; +}; + +typedef struct daala_writer daala_writer; + +void aom_daala_start_encode(daala_writer *w, uint8_t *buffer); +void aom_daala_stop_encode(daala_writer *w); + +static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) { +#if CONFIG_EC_SMALLMUL + int p = (0x7FFFFF - (prob << 15) + prob) >> 8; +#else + int p = ((prob << 15) + 256 - prob) >> 8; +#endif +#if CONFIG_BITSTREAM_DEBUG + aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; + /*int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + }*/ + bitstream_queue_push(bit, cdf, 2); +#endif + + od_ec_encode_bool_q15(&w->ec, bit, p); +} + +#if CONFIG_RAWBITS +static INLINE void aom_daala_write_bit(daala_writer *w, int bit) { + od_ec_enc_bits(&w->ec, bit, 1); +} +#endif + +static INLINE void daala_write_symbol(daala_writer *w, int symb, + const aom_cdf_prob *cdf, int nsymbs) { +#if CONFIG_BITSTREAM_DEBUG + /*int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + }*/ + bitstream_queue_push(symb, cdf, nsymbs); +#endif + + od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/third_party/aom/aom_dsp/dkboolreader.c b/third_party/aom/aom_dsp/dkboolreader.c new file mode 100644 index 000000000..288d5f1ce --- /dev/null +++ b/third_party/aom/aom_dsp/dkboolreader.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" + +#include "aom_dsp/dkboolreader.h" +#include "aom_dsp/prob.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" +#include "aom_mem/aom_mem.h" +#include "aom_util/endian_inl.h" + +static INLINE int aom_dk_read_bit(struct aom_dk_reader *r) { + return aom_dk_read(r, 128); // aom_prob_half +} + +int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer, + size_t size, aom_decrypt_cb decrypt_cb, + void *decrypt_state) { + if (size && !buffer) { + return 1; + } else { + r->buffer_end = buffer + size; + r->buffer_start = r->buffer = buffer; + r->value = 0; + r->count = -8; + r->range = 255; + r->decrypt_cb = decrypt_cb; + r->decrypt_state = decrypt_state; + aom_dk_reader_fill(r); +#if CONFIG_ACCOUNTING + r->accounting = NULL; +#endif + return aom_dk_read_bit(r) != 0; // marker bit + } +} + +void aom_dk_reader_fill(struct aom_dk_reader *r) { + const uint8_t *const buffer_end = r->buffer_end; + const uint8_t *buffer = r->buffer; + const uint8_t *buffer_start = buffer; + BD_VALUE value = r->value; + int count = r->count; + const size_t bytes_left = buffer_end - buffer; + const size_t bits_left = bytes_left * CHAR_BIT; + int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); + + if (r->decrypt_cb) { + size_t n = AOMMIN(sizeof(r->clear_buffer), bytes_left); + r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n); + buffer = r->clear_buffer; + buffer_start = r->clear_buffer; + } + if (bits_left > BD_VALUE_SIZE) { + const int bits = (shift & 0xfffffff8) + CHAR_BIT; + BD_VALUE nv; + BD_VALUE big_endian_values; + memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); +#if SIZE_MAX == 0xffffffffffffffffULL + big_endian_values = HToBE64(big_endian_values); +#else + big_endian_values = HToBE32(big_endian_values); +#endif + nv = big_endian_values >> (BD_VALUE_SIZE - bits); + count += bits; + buffer += (bits >> 3); + value = r->value | (nv << (shift & 0x7)); + } else { + const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left); + int loop_end = 0; + if (bits_over >= 0) { + count += LOTS_OF_BITS; + loop_end = bits_over; + } + + if (bits_over < 0 || bits_left) { + while (shift >= loop_end) { + count += CHAR_BIT; + value |= (BD_VALUE)*buffer++ << shift; + shift -= CHAR_BIT; + } + } + } + + // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption, + // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than + // assign 'buffer' to 'r->buffer'. + r->buffer += buffer - buffer_start; + r->value = value; + r->count = count; +} + +const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r) { + // Find the end of the coded buffer + while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) { + r->count -= CHAR_BIT; + r->buffer--; + } + return r->buffer; +} diff --git a/third_party/aom/aom_dsp/dkboolreader.h b/third_party/aom/aom_dsp/dkboolreader.h new file mode 100644 index 000000000..f0bc84381 --- /dev/null +++ b/third_party/aom/aom_dsp/dkboolreader.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_DKBOOLREADER_H_ +#define AOM_DSP_DKBOOLREADER_H_ + +#include +#include +#include + +#include "./aom_config.h" +#if CONFIG_BITSTREAM_DEBUG +#include +#include +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "aom_ports/mem.h" +#include "aom/aomdx.h" +#include "aom/aom_integer.h" +#include "aom_dsp/prob.h" +#if CONFIG_ACCOUNTING +#include "av1/decoder/accounting.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef size_t BD_VALUE; + +#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT) + +// This is meant to be a large, positive constant that can still be efficiently +// loaded as an immediate (on platforms like ARM, for example). +// Even relatively modest values like 100 would work fine. +#define LOTS_OF_BITS 0x40000000 + +struct aom_dk_reader { + // Be careful when reordering this struct, it may impact the cache negatively. + BD_VALUE value; + unsigned int range; + int count; + const uint8_t *buffer_start; + const uint8_t *buffer_end; + const uint8_t *buffer; + aom_decrypt_cb decrypt_cb; + void *decrypt_state; + uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; +#if CONFIG_ACCOUNTING + Accounting *accounting; +#endif +}; + +int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer, + size_t size, aom_decrypt_cb decrypt_cb, + void *decrypt_state); + +void aom_dk_reader_fill(struct aom_dk_reader *r); + +const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r); + +static INLINE uint32_t aom_dk_reader_tell(const struct aom_dk_reader *r) { + const uint32_t bits_read = + (uint32_t)((r->buffer - r->buffer_start) * CHAR_BIT); + const int count = + (r->count < LOTS_OF_BITS) ? r->count : r->count - LOTS_OF_BITS; + assert(r->buffer >= r->buffer_start); + return bits_read - (count + CHAR_BIT); +} + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define DK_BITRES (3) + +static INLINE uint32_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) { + uint32_t num_bits; + uint32_t range; + int l; + int i; + num_bits = aom_dk_reader_tell(r) << DK_BITRES; + range = r->range; + l = 0; + for (i = DK_BITRES; i-- > 0;) { + int b; + range = range * range >> 7; + b = (int)(range >> 8); + l = l << 1 | b; + range >>= b; + } + return num_bits - l; +} + +static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) { + // Check if we have reached the end of the buffer. + // + // Variable 'count' stores the number of bits in the 'value' buffer, minus + // 8. The top byte is part of the algorithm, and the remainder is buffered + // to be shifted into it. So if count == 8, the top 16 bits of 'value' are + // occupied, 8 for the algorithm and 8 in the buffer. + // + // When reading a byte from the user's buffer, count is filled with 8 and + // one byte is filled into the value buffer. When we reach the end of the + // data, count is additionally filled with LOTS_OF_BITS. So when + // count == LOTS_OF_BITS - 1, the user's data has been exhausted. + // + // 1 if we have tried to decode bits after the end of stream was encountered. + // 0 No error. + return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; +} + +static INLINE int aom_dk_read(struct aom_dk_reader *r, int prob) { + unsigned int bit = 0; + BD_VALUE value; + BD_VALUE bigsplit; + int count; + unsigned int range; + unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; + + if (r->count < 0) aom_dk_reader_fill(r); + + value = r->value; + count = r->count; + + bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); + + range = split; + + if (value >= bigsplit) { + range = r->range - split; + value = value - bigsplit; + bit = 1; + } + + { + register int shift = aom_norm[range]; + range <<= shift; + value <<= shift; + count -= shift; + } + r->value = value; + r->count = count; + r->range = range; + +#if CONFIG_BITSTREAM_DEBUG + { + int ref_bit, ref_prob; + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + bitstream_queue_pop(&ref_bit, &ref_prob); + if (prob != ref_prob) { + fprintf( + stderr, + "\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + assert(0); + } + if ((int)bit != ref_bit) { + fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n", + frame_idx, bit, ref_bit); + assert(0); + } + } +#endif // CONFIG_BITSTREAM_DEBUG + + return bit; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_DKBOOLREADER_H_ diff --git a/third_party/aom/aom_dsp/dkboolwriter.c b/third_party/aom/aom_dsp/dkboolwriter.c new file mode 100644 index 000000000..fc98e7c9b --- /dev/null +++ b/third_party/aom/aom_dsp/dkboolwriter.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./dkboolwriter.h" + +static INLINE void aom_dk_write_bit(aom_dk_writer *w, int bit) { + aom_dk_write(w, bit, 128); // aom_prob_half +} + +void aom_dk_start_encode(aom_dk_writer *br, uint8_t *source) { + br->lowvalue = 0; + br->range = 255; + br->count = -24; + br->buffer = source; + br->pos = 0; + aom_dk_write_bit(br, 0); +} + +void aom_dk_stop_encode(aom_dk_writer *br) { + int i; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(1); +#endif // CONFIG_BITSTREAM_DEBUG + + for (i = 0; i < 32; i++) aom_dk_write_bit(br, 0); + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(0); +#endif // CONFIG_BITSTREAM_DEBUG + + // Ensure there's no ambigous collision with any index marker bytes + if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; +} diff --git a/third_party/aom/aom_dsp/dkboolwriter.h b/third_party/aom/aom_dsp/dkboolwriter.h new file mode 100644 index 000000000..835436885 --- /dev/null +++ b/third_party/aom/aom_dsp/dkboolwriter.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_DKBOOLWRITER_H_ +#define AOM_DSP_DKBOOLWRITER_H_ + +#include "./aom_config.h" + +#if CONFIG_BITSTREAM_DEBUG +#include +#include "aom_util/debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG + +#include "aom_dsp/prob.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct aom_dk_writer { + unsigned int lowvalue; + unsigned int range; + int count; + unsigned int pos; + uint8_t *buffer; +} aom_dk_writer; + +void aom_dk_start_encode(aom_dk_writer *bc, uint8_t *buffer); +void aom_dk_stop_encode(aom_dk_writer *bc); + +static INLINE void aom_dk_write(aom_dk_writer *br, int bit, int probability) { + unsigned int split; + int count = br->count; + unsigned int range = br->range; + unsigned int lowvalue = br->lowvalue; + register int shift; + +#if CONFIG_BITSTREAM_DEBUG + // int queue_r = 0; + // int frame_idx_r = 0; + // int queue_w = bitstream_queue_get_write(); + // int frame_idx_w = bitstream_queue_get_frame_write(); + // if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + // fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + // frame_idx_w, queue_w); + // } + bitstream_queue_push(bit, probability); +#endif // CONFIG_BITSTREAM_DEBUG + + split = 1 + (((range - 1) * probability) >> 8); + + range = split; + + if (bit) { + lowvalue += split; + range = br->range - split; + } + + shift = aom_norm[range]; + + range <<= shift; + count += shift; + + if (count >= 0) { + int offset = shift - count; + + if ((lowvalue << (offset - 1)) & 0x80000000) { + int x = br->pos - 1; + + while (x >= 0 && br->buffer[x] == 0xff) { + br->buffer[x] = 0; + x--; + } + + br->buffer[x] += 1; + } + + br->buffer[br->pos++] = (lowvalue >> (24 - offset)); + lowvalue <<= offset; + shift = count; + lowvalue &= 0xffffff; + count -= 8; + } + + lowvalue <<= shift; + br->count = count; + br->lowvalue = lowvalue; + br->range = range; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_DKBOOLWRITER_H_ diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c new file mode 100644 index 000000000..ad76b7e3e --- /dev/null +++ b/third_party/aom/aom_dsp/entcode.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifdef HAVE_CONFIG_H +#include "./config.h" +#endif + +#include "aom_dsp/entcode.h" + +/*Given the current total integer number of bits used and the current value of + rng, computes the fraction number of bits used to OD_BITRES precision. + This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac(). + nbits_total: The number of whole bits currently used, i.e., the value + returned by od_ec_enc_tell() or od_ec_dec_tell(). + rng: The current value of rng from either the encoder or decoder state. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) { + uint32_t nbits; + int l; + int i; + /*To handle the non-integral number of bits still left in the encoder/decoder + state, we compute the worst-case number of bits of val that must be + encoded to ensure that the value is inside the range for any possible + subsequent bits. + The computation here is independent of val itself (the decoder does not + even track that value), even though the real number of bits used after + od_ec_enc_done() may be 1 smaller if rng is a power of two and the + corresponding trailing bits of val are all zeros. + If we did try to track that special case, then coding a value with a + probability of 1/(1 << n) might sometimes appear to use more than n bits. + This may help explain the surprising result that a newly initialized + encoder or decoder claims to have used 1 bit.*/ + nbits = nbits_total << OD_BITRES; + l = 0; + for (i = OD_BITRES; i-- > 0;) { + int b; + rng = rng * rng >> 15; + b = (int)(rng >> 16); + l = l << 1 | b; + rng >>= b; + } + return nbits - l; +} diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h new file mode 100644 index 000000000..534959e66 --- /dev/null +++ b/third_party/aom/aom_dsp/entcode.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if !defined(_entcode_H) +#define _entcode_H (1) +#include +#include +#include "av1/common/odintrin.h" + +/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic + on a larger type, you can speed up the decoder by using it here.*/ +typedef uint32_t od_ec_window; + +#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT) + +/*The number of bits to use for the range-coded part of unsigned integers.*/ +#define OD_EC_UINT_BITS (4) + +/*The resolution of fractional-precision bit usage measurements, i.e., + 3 => 1/8th bits.*/ +#define OD_BITRES (3) + +/*With CONFIG_EC_SMALLMUL, the value stored in a CDF is 32768 minus the actual + Q15 cumulative probability (an "inverse" CDF). + This function converts from one representation to the other (and is its own + inverse).*/ +#if CONFIG_EC_SMALLMUL +#define OD_ICDF(x) (32768U - (x)) +#else +#define OD_ICDF(x) (x) +#endif + +/*See entcode.c for further documentation.*/ + +OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, + uint32_t rng); + +#endif diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c new file mode 100644 index 000000000..49b176cd8 --- /dev/null +++ b/third_party/aom/aom_dsp/entdec.c @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifdef HAVE_CONFIG_H +#include "./config.h" +#endif + +#include "aom_dsp/entdec.h" + +/*A range decoder. + This is an entropy decoder based upon \cite{Mar79}, which is itself a + rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}. + It is very similar to arithmetic encoding, except that encoding is done with + digits in any base, instead of with bits, and so it is faster when using + larger bases (i.e.: a byte). + The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$ + is the base, longer than the theoretical optimum, but to my knowledge there + is no published justification for this claim. + This only seems true when using near-infinite precision arithmetic so that + the process is carried out with no rounding errors. + + An excellent description of implementation details is available at + http://www.arturocampos.com/ac_range.html + A recent work \cite{MNW98} which proposes several changes to arithmetic + encoding for efficiency actually re-discovers many of the principles + behind range encoding, and presents a good theoretical analysis of them. + + End of stream is handled by writing out the smallest number of bits that + ensures that the stream will be correctly decoded regardless of the value of + any subsequent bits. + od_ec_dec_tell() can be used to determine how many bits were needed to decode + all the symbols thus far; other data can be packed in the remaining bits of + the input buffer. + @PHDTHESIS{Pas76, + author="Richard Clark Pasco", + title="Source coding algorithms for fast data compression", + school="Dept. of Electrical Engineering, Stanford University", + address="Stanford, CA", + month=May, + year=1976, + URL="http://www.richpasco.org/scaffdc.pdf" + } + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video & Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*This is meant to be a large, positive constant that can still be efficiently + loaded as an immediate (on platforms like ARM, for example). + Even relatively modest values like 100 would work fine.*/ +#define OD_EC_LOTS_OF_BITS (0x4000) + +static void od_ec_dec_refill(od_ec_dec *dec) { + int s; + od_ec_window dif; + int16_t cnt; + const unsigned char *bptr; + const unsigned char *end; + dif = dec->dif; + cnt = dec->cnt; + bptr = dec->bptr; + end = dec->end; + s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); + for (; s >= 0 && bptr < end; s -= 8, bptr++) { + OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8); + dif ^= (od_ec_window)bptr[0] << s; + cnt += 8; + } + if (bptr >= end) { + dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt; + cnt = OD_EC_LOTS_OF_BITS; + } + dec->dif = dif; + dec->cnt = cnt; + dec->bptr = bptr; +} + +/*Takes updated dif and range values, renormalizes them so that + 32768 <= rng < 65536 (reading more bytes from the stream into dif if + necessary), and stores them back in the decoder context. + dif: The new value of dif. + rng: The new value of the range. + ret: The value to return. + Return: ret. + This allows the compiler to jump to this function via a tail-call.*/ +static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, + int ret) { + int d; + OD_ASSERT(rng <= 65535U); + d = 16 - OD_ILOG_NZ(rng); + dec->cnt -= d; +#if CONFIG_EC_SMALLMUL + /*This is equivalent to shifting in 1's instead of 0's.*/ + dec->dif = ((dif + 1) << d) - 1; +#else + dec->dif = dif << d; +#endif + dec->rng = rng << d; + if (dec->cnt < 0) od_ec_dec_refill(dec); + return ret; +} + +/*Initializes the decoder. + buf: The input buffer to use. + Return: 0 on success, or a negative value on error.*/ +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, + uint32_t storage) { + dec->buf = buf; + dec->eptr = buf + storage; + dec->end_window = 0; + dec->nend_bits = 0; + dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); + dec->end = buf + storage; + dec->bptr = buf; +#if CONFIG_EC_SMALLMUL + dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; +#else + dec->dif = 0; +#endif + dec->rng = 0x8000; + dec->cnt = -15; + dec->error = 0; + od_ec_dec_refill(dec); +} + +/*Decode a single binary value. + {EC_SMALLMUL} f: The probability that the bit is one, scaled by 32768. + {else} f: The probability that the bit is zero, scaled by 32768. + Return: The value decoded (0 or 1).*/ +int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { + od_ec_window dif; + od_ec_window vw; + unsigned r; + unsigned r_new; + unsigned v; + int ret; + OD_ASSERT(0 < f); + OD_ASSERT(f < 32768U); + dif = dec->dif; + r = dec->rng; + OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + OD_ASSERT(32768U <= r); +#if CONFIG_EC_SMALLMUL + v = (r >> 8) * (uint32_t)f >> 7; + vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + ret = 1; + r_new = v; + if (dif >= vw) { + r_new = r - v; + dif -= vw; + ret = 0; + } +#else + v = f * (uint32_t)r >> 15; + vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); + ret = 0; + r_new = v; + if (dif >= vw) { + r_new = r - v; + dif -= vw; + ret = 1; + } +#endif + return od_ec_dec_normalize(dec, dif, r_new, ret); +} + +/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15. + cdf: The CDF, such that symbol s falls in the range + [s > 0 ? cdf[s - 1] : 0, cdf[s]). + The values must be monotonically non-increasing, and cdf[nsyms - 1] + must be 32768. + {EC_SMALLMUL}: The CDF contains 32768 minus those values. + nsyms: The number of symbols in the alphabet. + This should be at most 16. + Return: The decoded symbol s.*/ +int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) { + od_ec_window dif; + unsigned r; + unsigned c; + unsigned u; + unsigned v; + int ret; + (void)nsyms; + dif = dec->dif; + r = dec->rng; + OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U)); + OD_ASSERT(32768U <= r); +#if CONFIG_EC_SMALLMUL + c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); + v = r; + ret = -1; + do { + u = v; + v = (r >> 8) * (uint32_t)cdf[++ret] >> 7; + } while (c < v); + OD_ASSERT(v < u); + OD_ASSERT(u <= r); + r = u - v; + dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); +#else + c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); + v = 0; + ret = -1; + do { + u = v; + v = cdf[++ret] * (uint32_t)r >> 15; + } while (v <= c); + OD_ASSERT(u < v); + OD_ASSERT(v <= r); + r = v - u; + dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16); +#endif + return od_ec_dec_normalize(dec, dif, r, ret); +} + +#if CONFIG_RAWBITS +/*Extracts a sequence of raw bits from the stream. + The bits must have been encoded with od_ec_enc_bits(). + ftb: The number of bits to extract. + This must be between 0 and 25, inclusive. + Return: The decoded bits.*/ +uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) { + od_ec_window window; + int available; + uint32_t ret; + OD_ASSERT(ftb <= 25); + window = dec->end_window; + available = dec->nend_bits; + if ((unsigned)available < ftb) { + const unsigned char *buf; + const unsigned char *eptr; + buf = dec->buf; + eptr = dec->eptr; + OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8); + do { + if (eptr <= buf) { + dec->tell_offs += OD_EC_LOTS_OF_BITS - available; + available = OD_EC_LOTS_OF_BITS; + break; + } + window |= (od_ec_window) * --eptr << available; + available += 8; + } while (available <= OD_EC_WINDOW_SIZE - 8); + dec->eptr = eptr; + } + ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1); + window >>= ftb; + available -= ftb; + dec->end_window = window; + dec->nend_bits = available; + return ret; +} +#endif + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_dec_tell(const od_ec_dec *dec) { + return (int)(((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 - + dec->cnt - dec->nend_bits + dec->tell_offs); +} + +/*Returns the number of bits "used" by the decoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) { + return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng); +} diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h new file mode 100644 index 000000000..e1145e81d --- /dev/null +++ b/third_party/aom/aom_dsp/entdec.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if !defined(_entdec_H) +#define _entdec_H (1) +#include +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_dec od_ec_dec; + +#if defined(OD_ACCOUNTING) && OD_ACCOUNTING +#define OD_ACC_STR , char *acc_str +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str) +#else +#define OD_ACC_STR +#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb) +#endif + +/*The entropy decoder context.*/ +struct od_ec_dec { + /*The start of the current input buffer.*/ + const unsigned char *buf; + /*The read pointer for the raw bits.*/ + const unsigned char *eptr; + /*Bits that will be read from/written at the end.*/ + od_ec_window end_window; + /*Number of valid bits in end_window.*/ + int nend_bits; + /*An offset used to keep track of tell after reaching the end of the stream. + This is constant throughout most of the decoding process, but becomes + important once we hit the end of the buffer and stop incrementing pointers + (and instead pretend cnt/nend_bits have lots of bits).*/ + int32_t tell_offs; + /*The end of the current input buffer.*/ + const unsigned char *end; + /*The read pointer for the entropy-coded bits.*/ + const unsigned char *bptr; + /*The difference between the coded value and the low end of the current + range. + {EC_SMALLMUL} The difference between the high end of the current range, + (low + rng), and the coded value, minus 1. + This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the + decoder only uses the top 16 bits of the window to decode the next symbol. + As we shift up during renormalization, if we don't have enough bits left in + the window to fill the top 16, we'll read in more bits of the coded + value.*/ + od_ec_window dif; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; + /*Nonzero if an error occurred.*/ + int error; +}; + +/*See entdec.c for further documentation.*/ + +void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec, + const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) + OD_ARG_NONNULL(1); + +OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) + OD_ARG_NONNULL(1); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c new file mode 100644 index 000000000..a350f27f4 --- /dev/null +++ b/third_party/aom/aom_dsp/entenc.c @@ -0,0 +1,507 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifdef HAVE_CONFIG_H +#include "./config.h" +#endif + +#include +#include +#include "aom_dsp/entenc.h" + +/*A range encoder. + See entdec.c and the references for implementation details \cite{Mar79,MNW98}. + + @INPROCEEDINGS{Mar79, + author="Martin, G.N.N.", + title="Range encoding: an algorithm for removing redundancy from a digitised + message", + booktitle="Video \& Data Recording Conference", + year=1979, + address="Southampton", + month=Jul, + URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz" + } + @ARTICLE{MNW98, + author="Alistair Moffat and Radford Neal and Ian H. Witten", + title="Arithmetic Coding Revisited", + journal="{ACM} Transactions on Information Systems", + year=1998, + volume=16, + number=3, + pages="256--294", + month=Jul, + URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf" + }*/ + +/*Takes updated low and range values, renormalizes them so that + 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if + necessary), and stores them back in the encoder context. + low: The new value of low. + rng: The new value of the range.*/ +static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, + unsigned rng) { + int d; + int c; + int s; + c = enc->cnt; + OD_ASSERT(rng <= 65535U); + d = 16 - OD_ILOG_NZ(rng); + s = c + d; + /*TODO: Right now we flush every time we have at least one byte available. + Instead we should use an od_ec_window and flush right before we're about to + shift bits off the end of the window. + For a 32-bit window this is about the same amount of work, but for a 64-bit + window it should be a fair win.*/ + if (s >= 0) { + uint16_t *buf; + uint32_t storage; + uint32_t offs; + unsigned m; + buf = enc->precarry_buf; + storage = enc->precarry_storage; + offs = enc->offs; + if (offs + 2 > storage) { + storage = 2 * storage + 2; + buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); + if (buf == NULL) { + enc->error = -1; + enc->offs = 0; + return; + } + enc->precarry_buf = buf; + enc->precarry_storage = storage; + } + c += 16; + m = (1 << c) - 1; + if (s >= 8) { + OD_ASSERT(offs < storage); + buf[offs++] = (uint16_t)(low >> c); + low &= m; + c -= 8; + m >>= 8; + } + OD_ASSERT(offs < storage); + buf[offs++] = (uint16_t)(low >> c); + s = c + d - 24; + low &= m; + enc->offs = offs; + } + enc->low = low << d; + enc->rng = rng << d; + enc->cnt = s; +} + +/*Initializes the encoder. + size: The initial size of the buffer, in bytes.*/ +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { + od_ec_enc_reset(enc); + enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size); + enc->storage = size; + if (size > 0 && enc->buf == NULL) { + enc->storage = 0; + enc->error = -1; + } + enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size); + enc->precarry_storage = size; + if (size > 0 && enc->precarry_buf == NULL) { + enc->precarry_storage = 0; + enc->error = -1; + } +} + +/*Reinitializes the encoder.*/ +void od_ec_enc_reset(od_ec_enc *enc) { + enc->end_offs = 0; + enc->end_window = 0; + enc->nend_bits = 0; + enc->offs = 0; + enc->low = 0; + enc->rng = 0x8000; + /*This is initialized to -9 so that it crosses zero after we've accumulated + one byte + one carry bit.*/ + enc->cnt = -9; + enc->error = 0; +#if OD_MEASURE_EC_OVERHEAD + enc->entropy = 0; + enc->nb_symbols = 0; +#endif +} + +/*Frees the buffers used by the encoder.*/ +void od_ec_enc_clear(od_ec_enc *enc) { + free(enc->precarry_buf); + free(enc->buf); +} + +/*Encodes a symbol given its frequency in Q15. + fl: The cumulative frequency of all symbols that come before the one to be + encoded. + fh: The cumulative frequency of all symbols up to and including the one to + be encoded. + {EC_SMALLMUL} Both values are 32768 minus that.*/ +static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { + od_ec_window l; + unsigned r; + unsigned u; + unsigned v; + l = enc->low; + r = enc->rng; + OD_ASSERT(32768U <= r); +#if CONFIG_EC_SMALLMUL + OD_ASSERT(fh < fl); + OD_ASSERT(fl <= 32768U); + if (fl < 32768U) { + u = (r >> 8) * (uint32_t)fl >> 7; + v = (r >> 8) * (uint32_t)fh >> 7; + l += r - u; + r = u - v; + } else { + r -= (r >> 8) * (uint32_t)fh >> 7; + } +#else + OD_ASSERT(fl < fh); + OD_ASSERT(fh <= 32768U); + u = fl * (uint32_t)r >> 15; + v = fh * (uint32_t)r >> 15; + r = v - u; + l += u; +#endif + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.); + enc->nb_symbols++; +#endif +} + +/*Encode a single binary value. + val: The value to encode (0 or 1). + {EC_SMALLMUL} f: The probability that the val is one, scaled by 32768. + {else} f: The probability that val is zero, scaled by 32768.*/ +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { + od_ec_window l; + unsigned r; + unsigned v; + OD_ASSERT(0 < f); + OD_ASSERT(f < 32768U); + l = enc->low; + r = enc->rng; + OD_ASSERT(32768U <= r); +#if CONFIG_EC_SMALLMUL + v = (r >> 8) * (uint32_t)f >> 7; + if (val) l += r - v; + r = val ? v : r - v; +#else + v = f * (uint32_t)r >> 15; + if (val) l += v; + r = val ? r - v : v; +#endif + od_ec_enc_normalize(enc, l, r); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy -= + OD_LOG2((double)(val ? 32768 - OD_ICDF(f) : OD_ICDF(f)) / 32768.); + enc->nb_symbols++; +#endif +} + +/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. + s: The index of the symbol to encode. + cdf: The CDF, such that symbol s falls in the range + [s > 0 ? cdf[s - 1] : 0, cdf[s]). + The values must be monotonically non-decreasing, and the last value + must be exactly 32768. + nsyms: The number of symbols in the alphabet. + This should be at most 16.*/ +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, + int nsyms) { + (void)nsyms; + OD_ASSERT(s >= 0); + OD_ASSERT(s < nsyms); + OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U)); + od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : OD_ICDF(0), cdf[s]); +} + +#if CONFIG_RAWBITS +/*Encodes a sequence of raw bits in the stream. + fl: The bits to encode. + ftb: The number of bits to encode. + This must be between 0 and 25, inclusive.*/ +void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) { + od_ec_window end_window; + int nend_bits; + OD_ASSERT(ftb <= 25); + OD_ASSERT(fl < (uint32_t)1 << ftb); +#if OD_MEASURE_EC_OVERHEAD + enc->entropy += ftb; +#endif + end_window = enc->end_window; + nend_bits = enc->nend_bits; + if (nend_bits + ftb > OD_EC_WINDOW_SIZE) { + unsigned char *buf; + uint32_t storage; + uint32_t end_offs; + buf = enc->buf; + storage = enc->storage; + end_offs = enc->end_offs; + if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) { + unsigned char *new_buf; + uint32_t new_storage; + new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3); + new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage); + if (new_buf == NULL) { + enc->error = -1; + enc->end_offs = 0; + return; + } + OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs, + end_offs); + storage = new_storage; + free(buf); + enc->buf = buf = new_buf; + enc->storage = storage; + } + do { + OD_ASSERT(end_offs < storage); + buf[storage - ++end_offs] = (unsigned char)end_window; + end_window >>= 8; + nend_bits -= 8; + } while (nend_bits >= 8); + enc->end_offs = end_offs; + } + OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE); + end_window |= (od_ec_window)fl << nend_bits; + nend_bits += ftb; + enc->end_window = end_window; + enc->nend_bits = nend_bits; +} +#endif + +/*Overwrites a few bits at the very start of an existing stream, after they + have already been encoded. + This makes it possible to have a few flags up front, where it is easy for + decoders to access them without parsing the whole stream, even if their + values are not determined until late in the encoding process, without having + to buffer all the intermediate symbols in the encoder. + In order for this to work, at least nbits bits must have already been encoded + using probabilities that are an exact power of two. + The encoder can verify the number of encoded bits is sufficient, but cannot + check this latter condition. + val: The bits to encode (in the least nbits significant bits). + They will be decoded in order from most-significant to least. + nbits: The number of bits to overwrite. + This must be no more than 8.*/ +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) { + int shift; + unsigned mask; + OD_ASSERT(nbits >= 0); + OD_ASSERT(nbits <= 8); + OD_ASSERT(val < 1U << nbits); + shift = 8 - nbits; + mask = ((1U << nbits) - 1) << shift; + if (enc->offs > 0) { + /*The first byte has been finalized.*/ + enc->precarry_buf[0] = + (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift); + } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) { + /*The first byte has yet to be output.*/ + enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) | + (od_ec_window)val << (16 + enc->cnt + shift); + } else { + /*The encoder hasn't even encoded _nbits of data yet.*/ + enc->error = -1; + } +} + +#if OD_MEASURE_EC_OVERHEAD +#include +#endif + +/*Indicates that there are no more symbols to encode. + All remaining output bytes are flushed to the output buffer. + od_ec_enc_reset() should be called before using the encoder again. + bytes: Returns the size of the encoded data in the returned buffer. + Return: A pointer to the start of the final buffer, or NULL if there was an + encoding error.*/ +unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { + unsigned char *out; + uint32_t storage; + uint16_t *buf; + uint32_t offs; + uint32_t end_offs; + int nend_bits; + od_ec_window m; + od_ec_window e; + od_ec_window l; + unsigned r; + int c; + int s; + if (enc->error) return NULL; +#if OD_MEASURE_EC_OVERHEAD + { + uint32_t tell; + /* Don't count the 1 bit we lose to raw bits as overhead. */ + tell = od_ec_enc_tell(enc) - 1; + fprintf(stderr, "overhead: %f%%\n", + 100 * (tell - enc->entropy) / enc->entropy); + fprintf(stderr, "efficiency: %f bits/symbol\n", + (double)tell / enc->nb_symbols); + } +#endif + /*We output the minimum number of bits that ensures that the symbols encoded + thus far will be decoded correctly regardless of the bits that follow.*/ + l = enc->low; + r = enc->rng; + c = enc->cnt; + s = 9; + m = 0x7FFF; + e = (l + m) & ~m; + while ((e | m) >= l + r) { + s++; + m >>= 1; + e = (l + m) & ~m; + } + s += c; + offs = enc->offs; + buf = enc->precarry_buf; + if (s > 0) { + unsigned n; + storage = enc->precarry_storage; + if (offs + ((s + 7) >> 3) > storage) { + storage = storage * 2 + ((s + 7) >> 3); + buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage); + if (buf == NULL) { + enc->error = -1; + return NULL; + } + enc->precarry_buf = buf; + enc->precarry_storage = storage; + } + n = (1 << (c + 16)) - 1; + do { + OD_ASSERT(offs < storage); + buf[offs++] = (uint16_t)(e >> (c + 16)); + e &= n; + s -= 8; + c -= 8; + n >>= 8; + } while (s > 0); + } + /*Make sure there's enough room for the entropy-coded bits and the raw + bits.*/ + out = enc->buf; + storage = enc->storage; + end_offs = enc->end_offs; + e = enc->end_window; + nend_bits = enc->nend_bits; + s = -s; + c = OD_MAXI((nend_bits - s + 7) >> 3, 0); + if (offs + end_offs + c > storage) { + storage = offs + end_offs + c; + out = (unsigned char *)realloc(out, sizeof(*out) * storage); + if (out == NULL) { + enc->error = -1; + return NULL; + } + OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs); + enc->buf = out; + enc->storage = storage; + } + /*If we have buffered raw bits, flush them as well.*/ + while (nend_bits > s) { + OD_ASSERT(end_offs < storage); + out[storage - ++end_offs] = (unsigned char)e; + e >>= 8; + nend_bits -= 8; + } + *nbytes = offs + end_offs; + /*Perform carry propagation.*/ + OD_ASSERT(offs + end_offs <= storage); + out = out + storage - (offs + end_offs); + c = 0; + end_offs = offs; + while (offs > 0) { + offs--; + c = buf[offs] + c; + out[offs] = (unsigned char)c; + c >>= 8; + } + /*Add any remaining raw bits to the last byte. + There is guaranteed to be enough room, because nend_bits <= s.*/ + OD_ASSERT(nend_bits <= 0 || end_offs > 0); + if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e; + /*Note: Unless there's an allocation error, if you keep encoding into the + current buffer and call this function again later, everything will work + just fine (you won't get a new packet out, but you will get a single + buffer with the new data appended to the old). + However, this function is O(N) where N is the amount of data coded so far, + so calling it more than once for a given packet is a bad idea.*/ + return out; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +int od_ec_enc_tell(const od_ec_enc *enc) { + /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra + bit, which we reserve for terminating the stream.*/ + return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10; +} + +/*Returns the number of bits "used" by the encoded symbols so far. + This same number can be computed in either the encoder or the decoder, and is + suitable for making coding decisions. + Warning: The value returned by this function can decrease compared to an + earlier call, even after encoding more data, if there is an encoding error + (i.e., a failure to allocate enough space for the output buffer). + Return: The number of bits scaled by 2**OD_BITRES. + This will always be slightly larger than the exact value (e.g., all + rounding error is in the positive direction).*/ +uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { + return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); +} + +/*Saves a entropy coder checkpoint to dst. + This allows an encoder to reverse a series of entropy coder + decisions if it decides that the information would have been + better coded some other way.*/ +void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) { + OD_COPY(dst, src, 1); +} + +/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint. + This can only be used to restore from checkpoints earlier in the target + state's history: you can not switch backwards and forwards or otherwise + switch to a state which isn't a casual ancestor of the current state. + Restore is also incompatible with patching the initial bits, as the + changes will remain in the restored version.*/ +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { + unsigned char *buf; + uint32_t storage; + uint16_t *precarry_buf; + uint32_t precarry_storage; + OD_ASSERT(dst->storage >= src->storage); + OD_ASSERT(dst->precarry_storage >= src->precarry_storage); + buf = dst->buf; + storage = dst->storage; + precarry_buf = dst->precarry_buf; + precarry_storage = dst->precarry_storage; + OD_COPY(dst, src, 1); + dst->buf = buf; + dst->storage = storage; + dst->precarry_buf = precarry_buf; + dst->precarry_storage = precarry_storage; +} diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h new file mode 100644 index 000000000..314b36318 --- /dev/null +++ b/third_party/aom/aom_dsp/entenc.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#if !defined(_entenc_H) +#define _entenc_H (1) +#include +#include "aom_dsp/entcode.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct od_ec_enc od_ec_enc; + +#define OD_MEASURE_EC_OVERHEAD (0) + +/*The entropy encoder context.*/ +struct od_ec_enc { + /*Buffered output. + This contains only the raw bits until the final call to od_ec_enc_done(), + where all the arithmetic-coded data gets prepended to it.*/ + unsigned char *buf; + /*The size of the buffer.*/ + uint32_t storage; + /*The offset at which the last byte containing raw bits was written.*/ + uint32_t end_offs; + /*Bits that will be read from/written at the end.*/ + od_ec_window end_window; + /*Number of valid bits in end_window.*/ + int nend_bits; + /*A buffer for output bytes with their associated carry flags.*/ + uint16_t *precarry_buf; + /*The size of the pre-carry buffer.*/ + uint32_t precarry_storage; + /*The offset at which the next entropy-coded byte will be written.*/ + uint32_t offs; + /*The low end of the current range.*/ + od_ec_window low; + /*The number of values in the current range.*/ + uint16_t rng; + /*The number of bits of data in the current value.*/ + int16_t cnt; + /*Nonzero if an error occurred.*/ + int error; +#if OD_MEASURE_EC_OVERHEAD + double entropy; + int nb_symbols; +#endif +}; + +/*See entenc.c for further documentation.*/ + +void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1); +void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1); +void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1); + +void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15) + OD_ARG_NONNULL(1); +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(3); + +void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) + OD_ARG_NONNULL(1); + +void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc, + uint32_t *nbytes) + OD_ARG_NONNULL(1) OD_ARG_NONNULL(2); + +OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc) + OD_ARG_NONNULL(1); +OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) + OD_ARG_NONNULL(1); + +void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src); +void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c new file mode 100644 index 000000000..09d945afc --- /dev/null +++ b/third_party/aom/aom_dsp/fastssim.c @@ -0,0 +1,493 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Nathan E. Egge, at the Daala + * project. + */ +#include +#include +#include +#include +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +typedef struct fs_level fs_level; +typedef struct fs_ctx fs_ctx; + +#define SSIM_C1 (255 * 255 * 0.01 * 0.01) +#define SSIM_C2 (255 * 255 * 0.03 * 0.03) +#if CONFIG_HIGHBITDEPTH +#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) +#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) +#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) +#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) +#endif +#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) +#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b)) + +struct fs_level { + uint32_t *im1; + uint32_t *im2; + double *ssim; + int w; + int h; +}; + +struct fs_ctx { + fs_level *level; + int nlevels; + unsigned *col_buf; +}; + +static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) { + unsigned char *data; + size_t data_size; + int lw; + int lh; + int l; + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + data_size = + _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf); + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size += im_size; + level_size *= sizeof(*_ctx->level[l].ssim); + data_size += level_size; + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + data = (unsigned char *)malloc(data_size); + _ctx->level = (fs_level *)data; + _ctx->nlevels = _nlevels; + data += _nlevels * sizeof(*_ctx->level); + lw = (_w + 1) >> 1; + lh = (_h + 1) >> 1; + for (l = 0; l < _nlevels; l++) { + size_t im_size; + size_t level_size; + _ctx->level[l].w = lw; + _ctx->level[l].h = lh; + im_size = lw * (size_t)lh; + level_size = 2 * im_size * sizeof(*_ctx->level[l].im1); + level_size += sizeof(*_ctx->level[l].ssim) - 1; + level_size /= sizeof(*_ctx->level[l].ssim); + level_size *= sizeof(*_ctx->level[l].ssim); + _ctx->level[l].im1 = (uint32_t *)data; + _ctx->level[l].im2 = _ctx->level[l].im1 + im_size; + data += level_size; + _ctx->level[l].ssim = (double *)data; + data += im_size * sizeof(*_ctx->level[l].ssim); + lw = (lw + 1) >> 1; + lh = (lh + 1) >> 1; + } + _ctx->col_buf = (unsigned *)data; +} + +static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); } + +static void fs_downsample_level(fs_ctx *_ctx, int _l) { + const uint32_t *src1; + const uint32_t *src2; + uint32_t *dst1; + uint32_t *dst2; + int w2; + int h2; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + dst1 = _ctx->level[_l].im1; + dst2 = _ctx->level[_l].im2; + w2 = _ctx->level[_l - 1].w; + h2 = _ctx->level[_l - 1].h; + src1 = _ctx->level[_l - 1].im1; + src2 = _ctx->level[_l - 1].im2; + for (j = 0; j < h; j++) { + int j0offs; + int j1offs; + j0offs = 2 * j * w2; + j1offs = FS_MINI(2 * j + 1, h2) * w2; + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, w2); + dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]; + dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]; + } + } +} + +static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, + int _s1ystride, const uint8_t *_src2, + int _s2ystride, int _w, int _h, uint32_t bd, + uint32_t shift) { + uint32_t *dst1; + uint32_t *dst2; + int w; + int h; + int i; + int j; + w = _ctx->level[0].w; + h = _ctx->level[0].h; + dst1 = _ctx->level[0].im1; + dst2 = _ctx->level[0].im2; + for (j = 0; j < h; j++) { + int j0; + int j1; + j0 = 2 * j; + j1 = FS_MINI(j0 + 1, _h); + for (i = 0; i < w; i++) { + int i0; + int i1; + i0 = 2 * i; + i1 = FS_MINI(i0 + 1, _w); + if (bd == 8 && shift == 0) { + dst1[j * w + i] = + _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; + dst2[j * w + i] = + _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] + + _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1]; + } else { + uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1); + uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2); + dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) + + (src1s[j0 * _s1ystride + i1] >> shift) + + (src1s[j1 * _s1ystride + i0] >> shift) + + (src1s[j1 * _s1ystride + i1] >> shift); + dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) + + (src2s[j0 * _s2ystride + i1] >> shift) + + (src2s[j1 * _s2ystride + i0] >> shift) + + (src2s[j1 * _s2ystride + i1] >> shift); + } + } + } +} + +static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { + unsigned *col_sums_x; + unsigned *col_sums_y; + uint32_t *im1; + uint32_t *im2; + double *ssim; + double c1; + int w; + int h; + int j0offs; + int j1offs; + int i; + int j; + double ssim_c1 = SSIM_C1; +#if CONFIG_HIGHBITDEPTH + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; + if (bit_depth == 12) ssim_c1 = SSIM_C1_12; +#else + assert(bit_depth == 8); + (void)bit_depth; +#endif + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + col_sums_x = _ctx->col_buf; + col_sums_y = col_sums_x + w; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i]; + for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i]; + for (j = 1; j < 4; j++) { + j1offs = FS_MINI(j, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + ssim = _ctx->level[_l].ssim; + c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); + for (j = 0; j < h; j++) { + unsigned mux; + unsigned muy; + int i0; + int i1; + mux = 5 * col_sums_x[0]; + muy = 5 * col_sums_y[0]; + for (i = 1; i < 4; i++) { + i1 = FS_MINI(i, w - 1); + mux += col_sums_x[i1]; + muy += col_sums_y[i1]; + } + for (i = 0; i < w; i++) { + ssim[j * w + i] *= (2 * mux * (double)muy + c1) / + (mux * (double)mux + muy * (double)muy + c1); + if (i + 1 < w) { + i0 = FS_MAXI(0, i - 4); + i1 = FS_MINI(i + 4, w - 1); + mux += col_sums_x[i1] - col_sums_x[i0]; + muy += col_sums_x[i1] - col_sums_x[i0]; + } + } + if (j + 1 < h) { + j0offs = FS_MAXI(0, j - 4) * w; + for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; + j1offs = FS_MINI(j + 4, h - 1) * w; + for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; + for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + } + } +} + +#define FS_COL_SET(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] = gx * (double)gx; \ + col_sums_gy2[(_col)] = gy * (double)gy; \ + col_sums_gxgy[(_col)] = gx * (double)gy; \ + } while (0) + +#define FS_COL_ADD(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] += gx * (double)gx; \ + col_sums_gy2[(_col)] += gy * (double)gy; \ + col_sums_gxgy[(_col)] += gx * (double)gy; \ + } while (0) + +#define FS_COL_SUB(_col, _joffs, _ioffs) \ + do { \ + unsigned gx; \ + unsigned gy; \ + gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \ + col_sums_gx2[(_col)] -= gx * (double)gx; \ + col_sums_gy2[(_col)] -= gy * (double)gy; \ + col_sums_gxgy[(_col)] -= gx * (double)gy; \ + } while (0) + +#define FS_COL_COPY(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \ + } while (0) + +#define FS_COL_HALVE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \ + } while (0) + +#define FS_COL_DOUBLE(_col1, _col2) \ + do { \ + col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \ + col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \ + col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \ + } while (0) + +static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { + uint32_t *im1; + uint32_t *im2; + unsigned *gx_buf; + unsigned *gy_buf; + double *ssim; + double col_sums_gx2[8]; + double col_sums_gy2[8]; + double col_sums_gxgy[8]; + double c2; + int stride; + int w; + int h; + int i; + int j; + double ssim_c2 = SSIM_C2; +#if CONFIG_HIGHBITDEPTH + if (bit_depth == 10) ssim_c2 = SSIM_C2_10; + if (bit_depth == 12) ssim_c2 = SSIM_C2_12; +#else + assert(bit_depth == 8); + (void)bit_depth; +#endif + + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + im1 = _ctx->level[_l].im1; + im2 = _ctx->level[_l].im2; + ssim = _ctx->level[_l].ssim; + gx_buf = _ctx->col_buf; + stride = w + 8; + gy_buf = gx_buf + 8 * stride; + memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf)); + c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104; + for (j = 0; j < h + 4; j++) { + if (j < h - 1) { + for (i = 0; i < w - 1; i++) { + unsigned g1; + unsigned g2; + unsigned gx; + unsigned gy; + g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); + g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); + g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); + gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); + gx_buf[(j & 7) * stride + i + 4] = gx; + gy_buf[(j & 7) * stride + i + 4] = gy; + } + } else { + memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); + memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf)); + } + if (j >= 4) { + int k; + col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0; + col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0; + col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] = + col_sums_gxgy[0] = 0; + for (i = 4; i < 8; i++) { + FS_COL_SET(i, -1, 0); + FS_COL_ADD(i, 0, 0); + for (k = 1; k < 8 - i; k++) { + FS_COL_DOUBLE(i, i); + FS_COL_ADD(i, -k - 1, 0); + FS_COL_ADD(i, k, 0); + } + } + for (i = 0; i < w; i++) { + double mugx2; + double mugy2; + double mugxgy; + mugx2 = col_sums_gx2[0]; + for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k]; + mugy2 = col_sums_gy2[0]; + for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k]; + mugxgy = col_sums_gxgy[0]; + for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k]; + ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2); + if (i + 1 < w) { + FS_COL_SET(0, -1, 1); + FS_COL_ADD(0, 0, 1); + FS_COL_SUB(2, -3, 2); + FS_COL_SUB(2, 2, 2); + FS_COL_HALVE(1, 2); + FS_COL_SUB(3, -4, 3); + FS_COL_SUB(3, 3, 3); + FS_COL_HALVE(2, 3); + FS_COL_COPY(3, 4); + FS_COL_DOUBLE(4, 5); + FS_COL_ADD(4, -4, 5); + FS_COL_ADD(4, 3, 5); + FS_COL_DOUBLE(5, 6); + FS_COL_ADD(5, -3, 6); + FS_COL_ADD(5, 2, 6); + FS_COL_DOUBLE(6, 7); + FS_COL_ADD(6, -2, 7); + FS_COL_ADD(6, 1, 7); + FS_COL_SET(7, -1, 8); + FS_COL_ADD(7, 0, 8); + } + } + } + } +} + +#define FS_NLEVELS (4) + +/*These weights were derived from the default weights found in Wang's original + Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}. + We drop the finest scale and renormalize the rest to sum to 1.*/ + +static const double FS_WEIGHTS[FS_NLEVELS] = { + 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625 +}; + +static double fs_average(fs_ctx *_ctx, int _l) { + double *ssim; + double ret; + int w; + int h; + int i; + int j; + w = _ctx->level[_l].w; + h = _ctx->level[_l].h; + ssim = _ctx->level[_l].ssim; + ret = 0; + for (j = 0; j < h; j++) + for (i = 0; i < w; i++) ret += ssim[j * w + i]; + return pow(ret / (w * h), FS_WEIGHTS[_l]); +} + +static double convert_ssim_db(double _ssim, double _weight) { + assert(_weight >= _ssim); + if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB; + return 10 * (log10(_weight) - log10(_weight - _ssim)); +} + +static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, + int _dystride, int _w, int _h, uint32_t _bd, + uint32_t _shift) { + fs_ctx ctx; + double ret; + int l; + ret = 1; + fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, + _shift); + for (l = 0; l < FS_NLEVELS - 1; l++) { + fs_calc_structure(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_downsample_level(&ctx, l + 1); + } + fs_calc_structure(&ctx, l, _bd); + fs_apply_luminance(&ctx, l, _bd); + ret *= fs_average(&ctx, l); + fs_ctx_clear(&ctx); + return ret; +} + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd) { + double ssimv; + uint32_t bd_shift = 0; + aom_clear_system_state(); + assert(bd >= in_bd); + + bd_shift = bd - in_bd; + + *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, bd_shift); + *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift); + *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, bd_shift); + ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); + return convert_ssim_db(ssimv, 1.0); +} diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c new file mode 100644 index 000000000..12ee02ba1 --- /dev/null +++ b/third_party/aom/aom_dsp/fwd_txfm.c @@ -0,0 +1,809 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/fwd_txfm.h" +#include +#include "./aom_dsp_rtcd.h" + +void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + tran_low_t intermediate[4 * 4]; + const tran_low_t *in_low = NULL; + tran_low_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + tran_high_t in_high[4]; // canbe16 + tran_high_t step[4]; // canbe16 + tran_high_t temp1, temp2; // needs32 + int i; + for (i = 0; i < 4; ++i) { + // Load inputs. + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; + } + } else { + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; + } + // Transform. + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; + temp1 = (step[0] + step[1]) * cospi_16_64; + temp2 = (step[0] - step[1]) * cospi_16_64; + out[0] = (tran_low_t)fdct_round_shift(temp1); + out[2] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; + temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; + out[1] = (tran_low_t)fdct_round_shift(temp1); + out[3] = (tran_low_t)fdct_round_shift(temp2); + // Do next column (which is a transposed row in second/horizontal pass) + ++input; + out += 4; + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } + + { + int i, j; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; + } + } +} + +void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 4; ++r) + for (c = 0; c < 4; ++c) sum += input[r * stride + c]; + + output[0] = sum << 1; +} + +void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { + int i, j; + tran_low_t intermediate[64]; + int pass; + tran_low_t *output = intermediate; + const tran_low_t *in = NULL; + + // Transform columns + for (pass = 0; pass < 2; ++pass) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + for (i = 0; i < 8; i++) { + // stage 1 + if (pass == 0) { + s0 = (input[0 * stride] + input[7 * stride]) * 4; + s1 = (input[1 * stride] + input[6 * stride]) * 4; + s2 = (input[2 * stride] + input[5 * stride]) * 4; + s3 = (input[3 * stride] + input[4 * stride]) * 4; + s4 = (input[3 * stride] - input[4 * stride]) * 4; + s5 = (input[2 * stride] - input[5 * stride]) * 4; + s6 = (input[1 * stride] - input[6 * stride]) * 4; + s7 = (input[0 * stride] - input[7 * stride]) * 4; + ++input; + } else { + s0 = in[0 * 8] + in[7 * 8]; + s1 = in[1 * 8] + in[6 * 8]; + s2 = in[2 * 8] + in[5 * 8]; + s3 = in[3 * 8] + in[4 * 8]; + s4 = in[3 * 8] - in[4 * 8]; + s5 = in[2 * 8] - in[5 * 8]; + s6 = in[1 * 8] - in[6 * 8]; + s7 = in[0 * 8] - in[7 * 8]; + ++in; + } + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x2 * cospi_24_64 + x3 * cospi_8_64; + t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; + output[0] = (tran_low_t)fdct_round_shift(t0); + output[2] = (tran_low_t)fdct_round_shift(t2); + output[4] = (tran_low_t)fdct_round_shift(t1); + output[6] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + output[1] = (tran_low_t)fdct_round_shift(t0); + output[3] = (tran_low_t)fdct_round_shift(t2); + output[5] = (tran_low_t)fdct_round_shift(t1); + output[7] = (tran_low_t)fdct_round_shift(t3); + output += 8; + } + in = intermediate; + output = final_output; + } + + // Rows + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; + } +} + +void aom_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + tran_low_t sum = 0; + for (r = 0; r < 8; ++r) + for (c = 0; c < 8; ++c) sum += input[r * stride + c]; + + output[0] = sum; +} + +void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + tran_low_t intermediate[256]; + const tran_low_t *in_low = NULL; + tran_low_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + tran_high_t step1[8]; // canbe16 + tran_high_t step2[8]; // canbe16 + tran_high_t step3[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 + tran_high_t temp1, temp2; // needs32 + int i; + for (i = 0; i < 16; i++) { + if (0 == pass) { + // Calculate input for the first 8 results. + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; + // Calculate input for the next 8 results. + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; + } else { + // Calculate input for the first 8 results. + assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); + // Calculate input for the next 8 results. + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; + } + // Work on the first eight values; fdct8(input, even_results); + { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 + tran_high_t t0, t1, t2, t3; // needs32 + tran_high_t x0, x1, x2, x3; // canbe16 + + // stage 1 + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; + + // fdct4(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; + out[0] = (tran_low_t)fdct_round_shift(t0); + out[4] = (tran_low_t)fdct_round_shift(t2); + out[8] = (tran_low_t)fdct_round_shift(t1); + out[12] = (tran_low_t)fdct_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = fdct_round_shift(t0); + t3 = fdct_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = (tran_low_t)fdct_round_shift(t0); + out[6] = (tran_low_t)fdct_round_shift(t2); + out[10] = (tran_low_t)fdct_round_shift(t1); + out[14] = (tran_low_t)fdct_round_shift(t3); + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = fdct_round_shift(temp1); + step2[3] = fdct_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = (step1[5] + step1[2]) * cospi_16_64; + step2[4] = fdct_round_shift(temp1); + step2[5] = fdct_round_shift(temp2); + // step 3 + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; + // step 4 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; + step2[1] = fdct_round_shift(temp1); + step2[2] = fdct_round_shift(temp2); + temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = fdct_round_shift(temp1); + step2[6] = fdct_round_shift(temp2); + // step 5 + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] + step2[2]; + step1[3] = step3[3] - step2[2]; + step1[4] = step3[4] - step2[5]; + step1[5] = step3[4] + step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; + // step 6 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = (tran_low_t)fdct_round_shift(temp1); + out[9] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = (tran_low_t)fdct_round_shift(temp1); + out[13] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = (tran_low_t)fdct_round_shift(temp1); + out[11] = (tran_low_t)fdct_round_shift(temp2); + temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = (tran_low_t)fdct_round_shift(temp1); + out[15] = (tran_low_t)fdct_round_shift(temp2); + } + // Do next column (which is a transposed row in second/horizontal pass) + input++; + out += 16; + } + // Setup in/out for next pass. + in_low = intermediate; + out = output; + } +} + +void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + int sum = 0; + for (r = 0; r < 16; ++r) + for (c = 0; c < 16; ++c) sum += input[r * stride + c]; + + output[0] = (tran_low_t)(sum >> 1); +} + +static INLINE tran_high_t dct_32_round(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + // TODO(debargha, peter.derivaz): Find new bounds for this assert, + // and make the bounds consts. + // assert(-131072 <= rv && rv <= 131071); + return rv; +} + +static INLINE tran_high_t half_round_shift(tran_high_t input) { + tran_high_t rv = (input + 1 + (input < 0)) >> 2; + return rv; +} + +void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) { + tran_high_t step[32]; + // Stage 1 + step[0] = input[0] + input[(32 - 1)]; + step[1] = input[1] + input[(32 - 2)]; + step[2] = input[2] + input[(32 - 3)]; + step[3] = input[3] + input[(32 - 4)]; + step[4] = input[4] + input[(32 - 5)]; + step[5] = input[5] + input[(32 - 6)]; + step[6] = input[6] + input[(32 - 7)]; + step[7] = input[7] + input[(32 - 8)]; + step[8] = input[8] + input[(32 - 9)]; + step[9] = input[9] + input[(32 - 10)]; + step[10] = input[10] + input[(32 - 11)]; + step[11] = input[11] + input[(32 - 12)]; + step[12] = input[12] + input[(32 - 13)]; + step[13] = input[13] + input[(32 - 14)]; + step[14] = input[14] + input[(32 - 15)]; + step[15] = input[15] + input[(32 - 16)]; + step[16] = -input[16] + input[(32 - 17)]; + step[17] = -input[17] + input[(32 - 18)]; + step[18] = -input[18] + input[(32 - 19)]; + step[19] = -input[19] + input[(32 - 20)]; + step[20] = -input[20] + input[(32 - 21)]; + step[21] = -input[21] + input[(32 - 22)]; + step[22] = -input[22] + input[(32 - 23)]; + step[23] = -input[23] + input[(32 - 24)]; + step[24] = -input[24] + input[(32 - 25)]; + step[25] = -input[25] + input[(32 - 26)]; + step[26] = -input[26] + input[(32 - 27)]; + step[27] = -input[27] + input[(32 - 28)]; + step[28] = -input[28] + input[(32 - 29)]; + step[29] = -input[29] + input[(32 - 30)]; + step[30] = -input[30] + input[(32 - 31)]; + step[31] = -input[31] + input[(32 - 32)]; + + // Stage 2 + output[0] = step[0] + step[16 - 1]; + output[1] = step[1] + step[16 - 2]; + output[2] = step[2] + step[16 - 3]; + output[3] = step[3] + step[16 - 4]; + output[4] = step[4] + step[16 - 5]; + output[5] = step[5] + step[16 - 6]; + output[6] = step[6] + step[16 - 7]; + output[7] = step[7] + step[16 - 8]; + output[8] = -step[8] + step[16 - 9]; + output[9] = -step[9] + step[16 - 10]; + output[10] = -step[10] + step[16 - 11]; + output[11] = -step[11] + step[16 - 12]; + output[12] = -step[12] + step[16 - 13]; + output[13] = -step[13] + step[16 - 14]; + output[14] = -step[14] + step[16 - 15]; + output[15] = -step[15] + step[16 - 16]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = step[18]; + output[19] = step[19]; + + output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); + output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); + output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); + output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); + + output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); + output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); + output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); + output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); + + output[28] = step[28]; + output[29] = step[29]; + output[30] = step[30]; + output[31] = step[31]; + + // dump the magnitude by 4, hence the intermediate values are within + // the range of 16 bits. + if (round) { + output[0] = half_round_shift(output[0]); + output[1] = half_round_shift(output[1]); + output[2] = half_round_shift(output[2]); + output[3] = half_round_shift(output[3]); + output[4] = half_round_shift(output[4]); + output[5] = half_round_shift(output[5]); + output[6] = half_round_shift(output[6]); + output[7] = half_round_shift(output[7]); + output[8] = half_round_shift(output[8]); + output[9] = half_round_shift(output[9]); + output[10] = half_round_shift(output[10]); + output[11] = half_round_shift(output[11]); + output[12] = half_round_shift(output[12]); + output[13] = half_round_shift(output[13]); + output[14] = half_round_shift(output[14]); + output[15] = half_round_shift(output[15]); + + output[16] = half_round_shift(output[16]); + output[17] = half_round_shift(output[17]); + output[18] = half_round_shift(output[18]); + output[19] = half_round_shift(output[19]); + output[20] = half_round_shift(output[20]); + output[21] = half_round_shift(output[21]); + output[22] = half_round_shift(output[22]); + output[23] = half_round_shift(output[23]); + output[24] = half_round_shift(output[24]); + output[25] = half_round_shift(output[25]); + output[26] = half_round_shift(output[26]); + output[27] = half_round_shift(output[27]); + output[28] = half_round_shift(output[28]); + output[29] = half_round_shift(output[29]); + output[30] = half_round_shift(output[30]); + output[31] = half_round_shift(output[31]); + } + + // Stage 3 + step[0] = output[0] + output[(8 - 1)]; + step[1] = output[1] + output[(8 - 2)]; + step[2] = output[2] + output[(8 - 3)]; + step[3] = output[3] + output[(8 - 4)]; + step[4] = -output[4] + output[(8 - 5)]; + step[5] = -output[5] + output[(8 - 6)]; + step[6] = -output[6] + output[(8 - 7)]; + step[7] = -output[7] + output[(8 - 8)]; + step[8] = output[8]; + step[9] = output[9]; + step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); + step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); + step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); + step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); + step[14] = output[14]; + step[15] = output[15]; + + step[16] = output[16] + output[23]; + step[17] = output[17] + output[22]; + step[18] = output[18] + output[21]; + step[19] = output[19] + output[20]; + step[20] = -output[20] + output[19]; + step[21] = -output[21] + output[18]; + step[22] = -output[22] + output[17]; + step[23] = -output[23] + output[16]; + step[24] = -output[24] + output[31]; + step[25] = -output[25] + output[30]; + step[26] = -output[26] + output[29]; + step[27] = -output[27] + output[28]; + step[28] = output[28] + output[27]; + step[29] = output[29] + output[26]; + step[30] = output[30] + output[25]; + step[31] = output[31] + output[24]; + + // Stage 4 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = -step[2] + step[1]; + output[3] = -step[3] + step[0]; + output[4] = step[4]; + output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); + output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); + output[7] = step[7]; + output[8] = step[8] + step[11]; + output[9] = step[9] + step[10]; + output[10] = -step[10] + step[9]; + output[11] = -step[11] + step[8]; + output[12] = -step[12] + step[15]; + output[13] = -step[13] + step[14]; + output[14] = step[14] + step[13]; + output[15] = step[15] + step[12]; + + output[16] = step[16]; + output[17] = step[17]; + output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); + output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); + output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); + output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); + output[22] = step[22]; + output[23] = step[23]; + output[24] = step[24]; + output[25] = step[25]; + output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); + output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); + output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); + output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); + output[30] = step[30]; + output[31] = step[31]; + + // Stage 5 + step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); + step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); + step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); + step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); + step[4] = output[4] + output[5]; + step[5] = -output[5] + output[4]; + step[6] = -output[6] + output[7]; + step[7] = output[7] + output[6]; + step[8] = output[8]; + step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); + step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); + step[11] = output[11]; + step[12] = output[12]; + step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); + step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); + step[15] = output[15]; + + step[16] = output[16] + output[19]; + step[17] = output[17] + output[18]; + step[18] = -output[18] + output[17]; + step[19] = -output[19] + output[16]; + step[20] = -output[20] + output[23]; + step[21] = -output[21] + output[22]; + step[22] = output[22] + output[21]; + step[23] = output[23] + output[20]; + step[24] = output[24] + output[27]; + step[25] = output[25] + output[26]; + step[26] = -output[26] + output[25]; + step[27] = -output[27] + output[24]; + step[28] = -output[28] + output[31]; + step[29] = -output[29] + output[30]; + step[30] = output[30] + output[29]; + step[31] = output[31] + output[28]; + + // Stage 6 + output[0] = step[0]; + output[1] = step[1]; + output[2] = step[2]; + output[3] = step[3]; + output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); + output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); + output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); + output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); + output[8] = step[8] + step[9]; + output[9] = -step[9] + step[8]; + output[10] = -step[10] + step[11]; + output[11] = step[11] + step[10]; + output[12] = step[12] + step[13]; + output[13] = -step[13] + step[12]; + output[14] = -step[14] + step[15]; + output[15] = step[15] + step[14]; + + output[16] = step[16]; + output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); + output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); + output[19] = step[19]; + output[20] = step[20]; + output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); + output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); + output[23] = step[23]; + output[24] = step[24]; + output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); + output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); + output[27] = step[27]; + output[28] = step[28]; + output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); + output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); + output[31] = step[31]; + + // Stage 7 + step[0] = output[0]; + step[1] = output[1]; + step[2] = output[2]; + step[3] = output[3]; + step[4] = output[4]; + step[5] = output[5]; + step[6] = output[6]; + step[7] = output[7]; + step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); + step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); + step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); + step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); + step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); + step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); + step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); + step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); + + step[16] = output[16] + output[17]; + step[17] = -output[17] + output[16]; + step[18] = -output[18] + output[19]; + step[19] = output[19] + output[18]; + step[20] = output[20] + output[21]; + step[21] = -output[21] + output[20]; + step[22] = -output[22] + output[23]; + step[23] = output[23] + output[22]; + step[24] = output[24] + output[25]; + step[25] = -output[25] + output[24]; + step[26] = -output[26] + output[27]; + step[27] = output[27] + output[26]; + step[28] = output[28] + output[29]; + step[29] = -output[29] + output[28]; + step[30] = -output[30] + output[31]; + step[31] = output[31] + output[30]; + + // Final stage --- outputs indices are bit-reversed. + output[0] = step[0]; + output[16] = step[1]; + output[8] = step[2]; + output[24] = step[3]; + output[4] = step[4]; + output[20] = step[5]; + output[12] = step[6]; + output[28] = step[7]; + output[2] = step[8]; + output[18] = step[9]; + output[10] = step[10]; + output[26] = step[11]; + output[6] = step[12]; + output[22] = step[13]; + output[14] = step[14]; + output[30] = step[15]; + + output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); + output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); + output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); + output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); + output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); + output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); + output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); + output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); + output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); + output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); + output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); + output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); + output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); + output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); + output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); + output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); +} + +void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { + int i, j; + tran_high_t output[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} + +// Note that although we use dct_32_round in dct32 computation flow, +// this 2d fdct32x32 for rate-distortion optimization loop is operating +// within 16 bits precision. +void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { + int i, j; + tran_high_t output[32 * 32]; + + // Columns + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + // TODO(cd): see quality impact of only doing + // output[j * 32 + i] = (temp_out[j] + 1) >> 2; + // PS: also change code in aom_dsp/x86/aom_dct_sse2.c + output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + } + + // Rows + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + aom_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + } +} + +void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { + int r, c; + int sum = 0; + for (r = 0; r < 32; ++r) + for (c = 0; c < 32; ++c) sum += input[r * stride + c]; + + output[0] = (tran_low_t)(sum >> 3); +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, + int stride) { + aom_fdct4x4_c(input, output, stride); +} + +void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, + int stride) { + aom_fdct8x8_c(input, final_output, stride); +} + +void aom_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, + int stride) { + aom_fdct8x8_1_c(input, final_output, stride); +} + +void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, + int stride) { + aom_fdct16x16_c(input, output, stride); +} + +void aom_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, + int stride) { + aom_fdct16x16_1_c(input, output, stride); +} + +void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { + aom_fdct32x32_c(input, out, stride); +} + +void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, + int stride) { + aom_fdct32x32_rd_c(input, out, stride); +} + +void aom_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, + int stride) { + aom_fdct32x32_1_c(input, out, stride); +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/fwd_txfm.h b/third_party/aom/aom_dsp/fwd_txfm.h new file mode 100644 index 000000000..579dbd06e --- /dev/null +++ b/third_party/aom/aom_dsp/fwd_txfm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_FWD_TXFM_H_ +#define AOM_DSP_FWD_TXFM_H_ + +#include "aom_dsp/txfm_common.h" + +static INLINE tran_high_t saturate_int16(tran_high_t value) { + tran_high_t result; + result = value > INT16_MAX ? INT16_MAX : value; + return result < INT16_MIN ? INT16_MIN : result; +} + +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round); +#endif // AOM_DSP_FWD_TXFM_H_ diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c new file mode 100644 index 000000000..1f0870b64 --- /dev/null +++ b/third_party/aom/aom_dsp/intrapred.c @@ -0,0 +1,971 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" + +#define DST(x, y) dst[(x) + (y)*stride] +#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) +#define AVG2(a, b) (((a) + (b) + 1) >> 1) + +static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void)above; + + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], + left[(c >> 1) + r + 2]) + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + } + dst += stride; + } +} + +static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void)left; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], + above[(r >> 1) + c + 2]) + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + } + dst += stride; + } +} + +static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + (void)left; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = AVG3(above[r + c], above[r + c + 1], + above[r + c + 1 + (r + c + 2 < bs * 2)]); + } + dst += stride; + } +} + +static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + + // first row + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i; +#if CONFIG_TX64X64 +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint8_t border[133]; +#else + uint8_t border[64 + 64 - 1]; // outer border from bottom-left to top-right +#endif +#else +#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 + // silence a spurious -Warray-bounds warning, possibly related to: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 + uint8_t border[69]; +#else + uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right +#endif +#endif // CONFIG_TX64X64 + + // dst(bs, bs - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bs - 2; ++i) { + border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + } + border[bs - 2] = AVG3(above[-1], left[0], left[1]); + border[bs - 1] = AVG3(left[0], above[-1], above[0]); + border[bs - 0] = AVG3(above[-1], above[0], above[1]); + // dst[0][2, size), i.e., remaining top border ascending + for (i = 0; i < bs - 2; ++i) { + border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + } + + for (i = 0; i < bs; ++i) { + memcpy(dst + i * stride, border + bs - 1 - i, bs); + } +} + +static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)left; + + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs); + dst += stride; + } +} + +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)above; + + for (r = 0; r < bs; r++) { + memset(dst, left[r], bs); + dst += stride; + } +} + +#if CONFIG_ALT_INTRA +static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; } + +static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, + uint16_t top_left) { + const int base = top + left - top_left; + const int p_left = abs_diff(base, left); + const int p_top = abs_diff(base, top); + const int p_top_left = abs_diff(base, top_left); + + // Return nearest to base of left, top and top_left. + return (p_left <= p_top && p_left <= p_top_left) + ? left + : (p_top <= p_top_left) ? top : top_left; +} + +static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + const uint8_t ytop_left = above[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +// Weights are quadratic from '1' to '1 / block_size', scaled by +// 2^sm_weight_log2_scale. +static const int sm_weight_log2_scale = 8; + +#if CONFIG_TX64X64 +// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) +#define MAX_BLOCK_DIM 64 +#define NUM_BLOCK_DIMS 6 // log2(MAX_BLOCK_DIM) +#else +#define MAX_BLOCK_DIM 32 +#define NUM_BLOCK_DIMS 5 +#endif // CONFIG_TX64X64 + +static const uint8_t sm_weight_arrays[NUM_BLOCK_DIMS][MAX_BLOCK_DIM] = { + // bs = 2 + { 255, 128 }, + // bs = 4 + { 255, 149, 85, 64 }, + // bs = 8 + { 255, 197, 146, 105, 73, 50, 37, 32 }, + // bs = 16 + { 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16 }, + // bs = 32 + { + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, + 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34, + 29, 25, 21, 17, 14, 12, 10, 9, 8, 8 }, +#if CONFIG_TX64X64 + // bs = 64 + { 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, + 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, + 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44, + 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13, + 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 }, +#endif // CONFIG_TX64X64 +}; + +// Some basic checks on weights for smooth predictor. +#define sm_weights_sanity_checks(weights, weights_scale, pred_scale) \ + assert(weights[0] < weights_scale); \ + assert(weights_scale - weights[bs - 1] < weights_scale); \ + assert(pred_scale < 31) // ensures no overflow when calculating predictor. + +#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) + +static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel + const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel + const int arr_index = get_msb(bs) - 1; + assert(arr_index >= 0); + assert(arr_index < NUM_BLOCK_DIMS); + const uint8_t *const sm_weights = sm_weight_arrays[arr_index]; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bs; ++r) { + int c; + for (c = 0; c < bs; ++c) { + const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r], + sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights[r] && scale >= sm_weights[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); + } + dst += stride; + } +} + +#else + +static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r, c; + int ytop_left = above[-1]; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel(left[r] + above[c] - ytop_left); + dst += stride; + } +} +#endif // CONFIG_ALT_INTRA + +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bs; r++) { + memset(dst, 128, bs); + dst += stride; + } +} + +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)above; + + for (i = 0; i < bs; i++) sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + (void)left; + + for (i = 0; i < bs; i++) sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + memset(dst, expected_dc, bs); + dst += stride; + } +} + +void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + (void)left; + + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(1, 1) = AVG3(C, D, D); +} + +void aom_d117_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + DST(0, 0) = AVG2(X, A); + DST(1, 0) = AVG2(A, B); + DST(0, 1) = AVG3(I, X, A); + DST(1, 1) = AVG3(X, A, B); +} + +void aom_d135_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + (void)stride; + DST(0, 1) = AVG3(X, I, J); + DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(1, 0) = AVG3(B, A, X); +} + +void aom_d153_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + + DST(0, 0) = AVG2(I, X); + DST(0, 1) = AVG2(J, I); + DST(1, 0) = AVG3(I, X, A); + DST(1, 1) = AVG3(J, I, X); +} + +void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + const int E = above[4]; + const int F = above[5]; + const int G = above[6]; + const int H = above[7]; + (void)stride; + (void)left; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); +} + +void aom_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +void aom_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + DST(0, 3) = AVG3(J, K, L); + DST(1, 3) = DST(0, 2) = AVG3(I, J, K); + DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); + DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); + DST(3, 1) = DST(2, 0) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} + +#if CONFIG_HIGHBITDEPTH +static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)above; + (void)bd; + + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], + left[(c >> 1) + r + 2]) + : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); + } + dst += stride; + } +} + +static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)left; + (void)bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], + above[(r >> 1) + c + 2]) + : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); + } + dst += stride; + } +} + +static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)left; + (void)bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = AVG3(above[r + c], above[r + c + 1], + above[r + c + 1 + (r + c + 2 < bs * 2)]); + } + dst += stride; + } +} + +static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)bd; + + // first row + for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + dst += stride; + + // second row + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + dst += stride; + + // the rest of first col + dst[0] = AVG3(above[-1], left[0], left[1]); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)bd; + dst[0] = AVG3(left[0], above[-1], above[0]); + for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; ++r) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + + dst += stride; + for (r = 1; r < bs; ++r) { + for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; + dst += stride; + } +} + +static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void)bd; + dst[0] = AVG2(above[-1], left[0]); + for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + dst++; + + dst[0] = AVG3(left[0], above[-1], above[0]); + dst[stride] = AVG3(above[-1], left[0], left[1]); + for (r = 2; r < bs; r++) + dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)left; + (void)bd; + for (r = 0; r < bs; r++) { + memcpy(dst, above, bs * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)bd; + for (r = 0; r < bs; r++) { + aom_memset16(dst, left[r], bs); + dst += stride; + } +} + +void aom_highbd_d207_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + (void)above; + (void)bd; + DST(0, 0) = AVG2(I, J); + DST(0, 1) = AVG2(J, K); + DST(1, 0) = AVG3(I, J, K); + DST(1, 1) = AVG3(J, K, L); +} + +void aom_highbd_d63_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)left; + (void)bd; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = AVG2(B, C); + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = AVG3(B, C, D); +} + +void aom_highbd_d45e_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int A = above[0]; + const int B = above[1]; + const int C = above[2]; + const int D = above[3]; + (void)stride; + (void)left; + (void)bd; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(1, 1) = AVG3(C, D, D); +} + +void aom_highbd_d117_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + (void)bd; + DST(0, 0) = AVG2(X, A); + DST(1, 0) = AVG2(A, B); + DST(0, 1) = AVG3(I, X, A); + DST(1, 1) = AVG3(X, A, B); +} + +void aom_highbd_d135_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + const int B = above[1]; + (void)bd; + DST(0, 1) = AVG3(X, I, J); + DST(1, 1) = DST(0, 0) = AVG3(A, X, I); + DST(1, 0) = AVG3(B, A, X); +} + +void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int X = above[-1]; + const int A = above[0]; + (void)bd; + DST(0, 0) = AVG2(I, X); + DST(0, 1) = AVG2(J, I); + DST(1, 0) = AVG3(I, X, A); + DST(1, 1) = AVG3(J, I, X); +} + +#if CONFIG_ALT_INTRA +static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + const uint16_t ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); + dst += stride; + } +} + +static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel + const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel + const int arr_index = get_msb(bs) - 1; + assert(arr_index >= 0); + assert(arr_index < NUM_BLOCK_DIMS); + const uint8_t *const sm_weights = sm_weight_arrays[arr_index]; + // scale = 2 * 2^sm_weight_log2_scale + const int log2_scale = 1 + sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + int r; + for (r = 0; r < bs; ++r) { + int c; + for (c = 0; c < bs; ++c) { + const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r], + sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + int i; + assert(scale >= sm_weights[r] && scale >= sm_weights[c]); + for (i = 0; i < 4; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); + } + dst += stride; + } +} + +#else +static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + int ytop_left = above[-1]; + (void)bd; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); + dst += stride; + } +} +#endif // CONFIG_ALT_INTRA + +static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void)above; + (void)left; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, 128 << (bd - 8), bs); + dst += stride; + } +} + +static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)above; + (void)bd; + + for (i = 0; i < bs; i++) sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void)left; + (void)bd; + + for (i = 0; i < bs; i++) sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + (void)bd; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + aom_memset16(dst, expected_dc, bs); + dst += stride; + } +} +#endif // CONFIG_HIGHBITDEPTH + +// This serves as a wrapper function, so that all the prediction functions +// can be unified and accessed as a pointer array. Note that the boundary +// above and left are not necessarily used all the time. +#define intra_pred_sized(type, size) \ + void aom_##type##_predictor_##size##x##size##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, size, above, left); \ + } + +#if CONFIG_HIGHBITDEPTH +#define intra_pred_highbd_sized(type, size) \ + void aom_highbd_##type##_predictor_##size##x##size##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + highbd_##type##_predictor(dst, stride, size, above, left, bd); \ + } + +/* clang-format off */ +#if CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) \ + intra_pred_highbd_sized(type, 64) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) \ + intra_pred_highbd_sized(type, 64) +#else // CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_highbd_sized(type, 2) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_highbd_sized(type, 4) \ + intra_pred_highbd_sized(type, 8) \ + intra_pred_highbd_sized(type, 16) \ + intra_pred_highbd_sized(type, 32) +#endif // CONFIG_TX64X64 + +#else + +#if CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) \ + intra_pred_sized(type, 64) +#else // CONFIG_TX64X64 +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) + +#define intra_pred_above_4x4(type) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 32) +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH + +intra_pred_allsizes(d207e) +intra_pred_allsizes(d63e) +intra_pred_above_4x4(d45e) +intra_pred_above_4x4(d117) +intra_pred_above_4x4(d135) +intra_pred_above_4x4(d153) +intra_pred_allsizes(v) +intra_pred_allsizes(h) +#if CONFIG_ALT_INTRA +intra_pred_allsizes(paeth) +intra_pred_allsizes(smooth) +#else +intra_pred_allsizes(tm) +#endif // CONFIG_ALT_INTRA +intra_pred_allsizes(dc_128) +intra_pred_allsizes(dc_left) +intra_pred_allsizes(dc_top) +intra_pred_allsizes(dc) +/* clang-format on */ +#undef intra_pred_allsizes diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c new file mode 100644 index 000000000..bb995856a --- /dev/null +++ b/third_party/aom/aom_dsp/inv_txfm.c @@ -0,0 +1,1445 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/inv_txfm.h" + +void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = WRAPLOW(a1); + op[1] = WRAPLOW(b1); + op[2] = WRAPLOW(c1); + op[3] = WRAPLOW(d1); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); + + ip++; + dest++; + } +} + +void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = WRAPLOW(a1); + op[1] = op[2] = op[3] = WRAPLOW(e1); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); + dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); + dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); + dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + ip++; + dest++; + } +} + +void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = WRAPLOW(dct_const_round_shift(temp1)); + step[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = WRAPLOW(dct_const_round_shift(temp1)); + step[3] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + output[0] = WRAPLOW(step[0] + step[3]); + output[1] = WRAPLOW(step[1] + step[2]); + output[2] = WRAPLOW(step[1] - step[2]); + output[3] = WRAPLOW(step[0] - step[3]); +} + +void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + aom_idct4_c(input, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + aom_idct4_c(temp_in, temp_out); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 4)); + } + } +} + +void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, + int dest_stride) { + int i; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 4); + + if (a1 == 0) return; + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel_add(dest[0], a1); + dest[1] = clip_pixel_add(dest[1], a1); + dest[2] = clip_pixel_add(dest[2], a1); + dest[3] = clip_pixel_add(dest[3], a1); + dest += dest_stride; + } +} + +void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + temp1 = (step1[0] + step1[2]) * cospi_16_64; + temp2 = (step1[0] - step1[2]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + // stage 3 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + // stage 4 + output[0] = WRAPLOW(step1[0] + step1[7]); + output[1] = WRAPLOW(step1[1] + step1[6]); + output[2] = WRAPLOW(step1[2] + step1[5]); + output[3] = WRAPLOW(step1[3] + step1[4]); + output[4] = WRAPLOW(step1[3] - step1[4]); + output[5] = WRAPLOW(step1[2] - step1[5]); + output[6] = WRAPLOW(step1[1] - step1[6]); + output[7] = WRAPLOW(step1[0] - step1[7]); +} + +void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + aom_idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + aom_idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + if (a1 == 0) return; + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = WRAPLOW(x0 - x2 + x3); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); +} + +void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); + s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); + s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); + s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + + // stage 2 + s0 = (int)x0; + s1 = (int)x1; + s2 = (int)x2; + s3 = (int)x3; + s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); + s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); + s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); + s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + + // stage 3 + s2 = (int)(cospi_16_64 * (x2 + x3)); + s3 = (int)(cospi_16_64 * (x2 - x3)); + s6 = (int)(cospi_16_64 * (x6 + x7)); + s7 = (int)(cospi_16_64 * (x6 - x7)); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x4); + output[2] = WRAPLOW(x6); + output[3] = WRAPLOW(-x2); + output[4] = WRAPLOW(x3); + output[5] = WRAPLOW(-x7); + output[6] = WRAPLOW(x5); + output[7] = WRAPLOW(-x1); +} + +void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[8], temp_out[8]; + + // First transform rows + // only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + aom_idct8_c(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + aom_idct8_c(temp_in, temp_out); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 5)); + } + } +} + +void aom_idct16_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = WRAPLOW(step2[0] + step2[15]); + output[1] = WRAPLOW(step2[1] + step2[14]); + output[2] = WRAPLOW(step2[2] + step2[13]); + output[3] = WRAPLOW(step2[3] + step2[12]); + output[4] = WRAPLOW(step2[4] + step2[11]); + output[5] = WRAPLOW(step2[5] + step2[10]); + output[6] = WRAPLOW(step2[6] + step2[9]); + output[7] = WRAPLOW(step2[7] + step2[8]); + output[8] = WRAPLOW(step2[7] - step2[8]); + output[9] = WRAPLOW(step2[6] - step2[9]); + output[10] = WRAPLOW(step2[5] - step2[10]); + output[11] = WRAPLOW(step2[4] - step2[11]); + output[12] = WRAPLOW(step2[3] - step2[12]); + output[13] = WRAPLOW(step2[2] - step2[13]); + output[14] = WRAPLOW(step2[1] - step2[14]); + output[15] = WRAPLOW(step2[0] - step2[15]); +} + +void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + aom_idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + aom_idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_high_t x0 = input[15]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[13]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[11]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[9]; + tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = WRAPLOW(s1 - s3); + x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); + x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = WRAPLOW(dct_const_round_shift(s2)); + x3 = WRAPLOW(dct_const_round_shift(s3)); + x6 = WRAPLOW(dct_const_round_shift(s6)); + x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); + + output[0] = WRAPLOW(x0); + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); +} + +void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + aom_idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + aom_idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[16], temp_out[16]; + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + for (i = 0; i < 4; ++i) { + aom_idct16_c(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + aom_idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + if (a1 == 0) return; + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +void aom_idct32_c(const tran_low_t *input, tran_low_t *output) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = WRAPLOW(dct_const_round_shift(temp1)); + step1[31] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = WRAPLOW(dct_const_round_shift(temp1)); + step2[15] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + + step2[16] = WRAPLOW(step1[16] + step1[17]); + step2[17] = WRAPLOW(step1[16] - step1[17]); + step2[18] = WRAPLOW(-step1[18] + step1[19]); + step2[19] = WRAPLOW(step1[18] + step1[19]); + step2[20] = WRAPLOW(step1[20] + step1[21]); + step2[21] = WRAPLOW(step1[20] - step1[21]); + step2[22] = WRAPLOW(-step1[22] + step1[23]); + step2[23] = WRAPLOW(step1[22] + step1[23]); + step2[24] = WRAPLOW(step1[24] + step1[25]); + step2[25] = WRAPLOW(step1[24] - step1[25]); + step2[26] = WRAPLOW(-step1[26] + step1[27]); + step2[27] = WRAPLOW(step1[26] + step1[27]); + step2[28] = WRAPLOW(step1[28] + step1[29]); + step2[29] = WRAPLOW(step1[28] - step1[29]); + step2[30] = WRAPLOW(-step1[30] + step1[31]); + step2[31] = WRAPLOW(step1[30] + step1[31]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = WRAPLOW(dct_const_round_shift(temp1)); + step1[7] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + + step1[8] = WRAPLOW(step2[8] + step2[9]); + step1[9] = WRAPLOW(step2[8] - step2[9]); + step1[10] = WRAPLOW(-step2[10] + step2[11]); + step1[11] = WRAPLOW(step2[10] + step2[11]); + step1[12] = WRAPLOW(step2[12] + step2[13]); + step1[13] = WRAPLOW(step2[12] - step2[13]); + step1[14] = WRAPLOW(-step2[14] + step2[15]); + step1[15] = WRAPLOW(step2[14] + step2[15]); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = WRAPLOW(dct_const_round_shift(temp1)); + step1[30] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = WRAPLOW(dct_const_round_shift(temp1)); + step2[1] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = WRAPLOW(dct_const_round_shift(temp1)); + step2[3] = WRAPLOW(dct_const_round_shift(temp2)); + step2[4] = WRAPLOW(step1[4] + step1[5]); + step2[5] = WRAPLOW(step1[4] - step1[5]); + step2[6] = WRAPLOW(-step1[6] + step1[7]); + step2[7] = WRAPLOW(step1[6] + step1[7]); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = WRAPLOW(dct_const_round_shift(temp1)); + step2[14] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = WRAPLOW(step1[16] + step1[19]); + step2[17] = WRAPLOW(step1[17] + step1[18]); + step2[18] = WRAPLOW(step1[17] - step1[18]); + step2[19] = WRAPLOW(step1[16] - step1[19]); + step2[20] = WRAPLOW(-step1[20] + step1[23]); + step2[21] = WRAPLOW(-step1[21] + step1[22]); + step2[22] = WRAPLOW(step1[21] + step1[22]); + step2[23] = WRAPLOW(step1[20] + step1[23]); + + step2[24] = WRAPLOW(step1[24] + step1[27]); + step2[25] = WRAPLOW(step1[25] + step1[26]); + step2[26] = WRAPLOW(step1[25] - step1[26]); + step2[27] = WRAPLOW(step1[24] - step1[27]); + step2[28] = WRAPLOW(-step1[28] + step1[31]); + step2[29] = WRAPLOW(-step1[29] + step1[30]); + step2[30] = WRAPLOW(step1[29] + step1[30]); + step2[31] = WRAPLOW(step1[28] + step1[31]); + + // stage 5 + step1[0] = WRAPLOW(step2[0] + step2[3]); + step1[1] = WRAPLOW(step2[1] + step2[2]); + step1[2] = WRAPLOW(step2[1] - step2[2]); + step1[3] = WRAPLOW(step2[0] - step2[3]); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = WRAPLOW(dct_const_round_shift(temp1)); + step1[6] = WRAPLOW(dct_const_round_shift(temp2)); + step1[7] = step2[7]; + + step1[8] = WRAPLOW(step2[8] + step2[11]); + step1[9] = WRAPLOW(step2[9] + step2[10]); + step1[10] = WRAPLOW(step2[9] - step2[10]); + step1[11] = WRAPLOW(step2[8] - step2[11]); + step1[12] = WRAPLOW(-step2[12] + step2[15]); + step1[13] = WRAPLOW(-step2[13] + step2[14]); + step1[14] = WRAPLOW(step2[13] + step2[14]); + step1[15] = WRAPLOW(step2[12] + step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = WRAPLOW(dct_const_round_shift(temp1)); + step1[29] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = WRAPLOW(dct_const_round_shift(temp1)); + step1[28] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = WRAPLOW(step1[0] + step1[7]); + step2[1] = WRAPLOW(step1[1] + step1[6]); + step2[2] = WRAPLOW(step1[2] + step1[5]); + step2[3] = WRAPLOW(step1[3] + step1[4]); + step2[4] = WRAPLOW(step1[3] - step1[4]); + step2[5] = WRAPLOW(step1[2] - step1[5]); + step2[6] = WRAPLOW(step1[1] - step1[6]); + step2[7] = WRAPLOW(step1[0] - step1[7]); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = WRAPLOW(dct_const_round_shift(temp1)); + step2[13] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = WRAPLOW(dct_const_round_shift(temp1)); + step2[12] = WRAPLOW(dct_const_round_shift(temp2)); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = WRAPLOW(step1[16] + step1[23]); + step2[17] = WRAPLOW(step1[17] + step1[22]); + step2[18] = WRAPLOW(step1[18] + step1[21]); + step2[19] = WRAPLOW(step1[19] + step1[20]); + step2[20] = WRAPLOW(step1[19] - step1[20]); + step2[21] = WRAPLOW(step1[18] - step1[21]); + step2[22] = WRAPLOW(step1[17] - step1[22]); + step2[23] = WRAPLOW(step1[16] - step1[23]); + + step2[24] = WRAPLOW(-step1[24] + step1[31]); + step2[25] = WRAPLOW(-step1[25] + step1[30]); + step2[26] = WRAPLOW(-step1[26] + step1[29]); + step2[27] = WRAPLOW(-step1[27] + step1[28]); + step2[28] = WRAPLOW(step1[27] + step1[28]); + step2[29] = WRAPLOW(step1[26] + step1[29]); + step2[30] = WRAPLOW(step1[25] + step1[30]); + step2[31] = WRAPLOW(step1[24] + step1[31]); + + // stage 7 + step1[0] = WRAPLOW(step2[0] + step2[15]); + step1[1] = WRAPLOW(step2[1] + step2[14]); + step1[2] = WRAPLOW(step2[2] + step2[13]); + step1[3] = WRAPLOW(step2[3] + step2[12]); + step1[4] = WRAPLOW(step2[4] + step2[11]); + step1[5] = WRAPLOW(step2[5] + step2[10]); + step1[6] = WRAPLOW(step2[6] + step2[9]); + step1[7] = WRAPLOW(step2[7] + step2[8]); + step1[8] = WRAPLOW(step2[7] - step2[8]); + step1[9] = WRAPLOW(step2[6] - step2[9]); + step1[10] = WRAPLOW(step2[5] - step2[10]); + step1[11] = WRAPLOW(step2[4] - step2[11]); + step1[12] = WRAPLOW(step2[3] - step2[12]); + step1[13] = WRAPLOW(step2[2] - step2[13]); + step1[14] = WRAPLOW(step2[1] - step2[14]); + step1[15] = WRAPLOW(step2[0] - step2[15]); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = WRAPLOW(dct_const_round_shift(temp1)); + step1[27] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = WRAPLOW(dct_const_round_shift(temp1)); + step1[26] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = WRAPLOW(dct_const_round_shift(temp1)); + step1[25] = WRAPLOW(dct_const_round_shift(temp2)); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = WRAPLOW(dct_const_round_shift(temp1)); + step1[24] = WRAPLOW(dct_const_round_shift(temp2)); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = WRAPLOW(step1[0] + step1[31]); + output[1] = WRAPLOW(step1[1] + step1[30]); + output[2] = WRAPLOW(step1[2] + step1[29]); + output[3] = WRAPLOW(step1[3] + step1[28]); + output[4] = WRAPLOW(step1[4] + step1[27]); + output[5] = WRAPLOW(step1[5] + step1[26]); + output[6] = WRAPLOW(step1[6] + step1[25]); + output[7] = WRAPLOW(step1[7] + step1[24]); + output[8] = WRAPLOW(step1[8] + step1[23]); + output[9] = WRAPLOW(step1[9] + step1[22]); + output[10] = WRAPLOW(step1[10] + step1[21]); + output[11] = WRAPLOW(step1[11] + step1[20]); + output[12] = WRAPLOW(step1[12] + step1[19]); + output[13] = WRAPLOW(step1[13] + step1[18]); + output[14] = WRAPLOW(step1[14] + step1[17]); + output[15] = WRAPLOW(step1[15] + step1[16]); + output[16] = WRAPLOW(step1[15] - step1[16]); + output[17] = WRAPLOW(step1[14] - step1[17]); + output[18] = WRAPLOW(step1[13] - step1[18]); + output[19] = WRAPLOW(step1[12] - step1[19]); + output[20] = WRAPLOW(step1[11] - step1[20]); + output[21] = WRAPLOW(step1[10] - step1[21]); + output[22] = WRAPLOW(step1[9] - step1[22]); + output[23] = WRAPLOW(step1[8] - step1[23]); + output[24] = WRAPLOW(step1[7] - step1[24]); + output[25] = WRAPLOW(step1[6] - step1[25]); + output[26] = WRAPLOW(step1[5] - step1[26]); + output[27] = WRAPLOW(step1[4] - step1[27]); + output[28] = WRAPLOW(step1[3] - step1[28]); + output[29] = WRAPLOW(step1[2] - step1[29]); + output[30] = WRAPLOW(step1[1] - step1[30]); + output[31] = WRAPLOW(step1[0] - step1[31]); +} + +void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + int16_t zero_coeff[16]; + for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + aom_idct32_c(input, outptr); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + aom_idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + aom_idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } +} + +void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; + tran_high_t a1; + + tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + if (a1 == 0) return; + + for (j = 0; j < 32; ++j) { + for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); + dest += stride; + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. */ + int i; + tran_low_t output[16]; + tran_high_t a1, b1, c1, d1, e1; + const tran_low_t *ip = input; + tran_low_t *op = output; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> UNIT_QUANT_SHIFT; + c1 = ip[1] >> UNIT_QUANT_SHIFT; + d1 = ip[2] >> UNIT_QUANT_SHIFT; + b1 = ip[3] >> UNIT_QUANT_SHIFT; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = HIGHBD_WRAPLOW(b1, bd); + op[2] = HIGHBD_WRAPLOW(c1, bd); + op[3] = HIGHBD_WRAPLOW(d1, bd); + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[stride * 0] = + highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); + dest[stride * 1] = + highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); + dest[stride * 2] = + highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); + dest[stride * 3] = + highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); + + ip++; + dest++; + } +} + +void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1, e1; + tran_low_t tmp[4]; + const tran_low_t *ip = in; + tran_low_t *op = tmp; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + (void)bd; + + a1 = ip[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + op[0] = HIGHBD_WRAPLOW(a1, bd); + op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = + highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); + dest[dest_stride * 1] = + highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); + dest[dest_stride * 2] = + highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); + dest[dest_stride * 3] = + highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + ip++; + dest++; + } +} + +void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step[4]; + tran_high_t temp1, temp2; + (void)bd; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 + output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); + output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); + output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); + output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); +} + +void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[4], temp_out[4]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // Rows + for (i = 0; i < 4; ++i) { + aom_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + aom_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } +} + +void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, + int dest_stride, int bd) { + int i; + tran_high_t a1; + tran_low_t out = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); + dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); + dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); + dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); + dest += dest_stride; + } +} + +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h new file mode 100644 index 000000000..e64d463ea --- /dev/null +++ b/third_party/aom/aom_dsp/inv_txfm.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_INV_TXFM_H_ +#define AOM_DSP_INV_TXFM_H_ + +#include + +#include "./aom_config.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +static INLINE tran_high_t check_range(tran_high_t input, int bd) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid AV1 input streams, intermediate stage coefficients should always + // stay within the range of a signed 16 bit integer. Coefficients can go out + // of this range for invalid/corrupt AV1 streams. However, strictly checking + // this range for every intermediate coefficient can burdensome for a decoder, + // therefore the following assertion is only enabled when configured with + // --enable-coefficient-range-checking. + // For valid highbitdepth AV1 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + const int32_t int_max = (1 << (7 + bd)) - 1; + const int32_t int_min = -int_max - 1; + assert(int_min <= input); + assert(input <= int_max); + (void)int_min; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + (void)bd; + return input; +} + +#define WRAPLOW(x) ((int32_t)check_range(x, 8)) +#if CONFIG_HIGHBITDEPTH +#define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd)) +#endif // CONFIG_HIGHBITDEPTH + +void aom_idct4_c(const tran_low_t *input, tran_low_t *output); +void aom_idct8_c(const tran_low_t *input, tran_low_t *output); +void aom_idct16_c(const tran_low_t *input, tran_low_t *output); +void aom_idct32_c(const tran_low_t *input, tran_low_t *output); +void aom_iadst4_c(const tran_low_t *input, tran_low_t *output); +void aom_iadst8_c(const tran_low_t *input, tran_low_t *output); +void aom_iadst16_c(const tran_low_t *input, tran_low_t *output); + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd); + +void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); +void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); + +static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, + int bd) { + trans = HIGHBD_WRAPLOW(trans, bd); + return clip_pixel_highbd(dest + (int)trans, bd); +} +#endif + +static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { + trans = WRAPLOW(trans); + return clip_pixel(dest + (int)trans); +} +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_INV_TXFM_H_ diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c new file mode 100644 index 000000000..e2e839219 --- /dev/null +++ b/third_party/aom/aom_dsp/loopfilter.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +static INLINE int8_t signed_char_clamp(int t) { + return (int8_t)clamp(t, -128, 127); +} + +#define PARALLEL_DEBLOCKING_11_TAP 0 +#define PARALLEL_DEBLOCKING_9_TAP 0 + +#if CONFIG_HIGHBITDEPTH +static INLINE int16_t signed_char_clamp_high(int t, int bd) { + switch (bd) { + case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); + case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1); + case 8: + default: return (int16_t)clamp(t, -128, 128 - 1); + } +} +#endif +#if CONFIG_PARALLEL_DEBLOCKING +// should we apply any filter at all: 11111111 yes, 00000000 no +static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, + uint8_t p0, uint8_t q0, uint8_t q1) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} +#endif // CONFIG_PARALLEL_DEBLOCKING +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + mask |= (abs(p3 - p0) > thresh) * -1; + mask |= (abs(q3 - q0) > thresh) * -1; + return ~mask; +} + +#if PARALLEL_DEBLOCKING_9_TAP +static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0, + uint8_t q0, uint8_t q4) { + int8_t mask = 0; + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} +#endif + +#if PARALLEL_DEBLOCKING_11_TAP +static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4, + uint8_t p0, uint8_t q0, uint8_t q4, + uint8_t q5) { + int8_t mask = 0; + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + mask |= (abs(p5 - p0) > thresh) * -1; + mask |= (abs(q5 - q0) > thresh) * -1; + return ~mask; +} +#endif + +static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3, + uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, + uint8_t q1, uint8_t q2, uint8_t q3, + uint8_t q4) { + int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); + mask |= (abs(p4 - p0) > thresh) * -1; + mask |= (abs(q4 - q0) > thresh) * -1; + return ~mask; +} + +// is there high edge variance internal edge: 11111111 yes, 00000000 no +static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { + int8_t hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t)*op1 ^ 0x80; + const int8_t ps0 = (int8_t)*op0 ^ 0x80; + const int8_t qs0 = (int8_t)*oq0 ^ 0x80; + const int8_t qs1 = (int8_t)*oq1 ^ 0x80; + const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; + *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + + // outer tap adjustments + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; + *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; +} + +void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint8_t p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); +#endif // !CONFIG_PARALLEL_DEBLOCKING + filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } +} + +void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint8_t p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1]; + const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); +#endif // !CONFIG_PARALLEL_DEBLOCKING + filter4(mask, *thresh, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} + +void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } +} + +void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; + + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, + s + 3); + s += pitch; + } +} + +void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); +} + +#if PARALLEL_DEBLOCKING_11_TAP +static INLINE void filter12(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op5, uint8_t *op4, + uint8_t *op3, uint8_t *op2, uint8_t *op1, + uint8_t *op0, uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3, uint8_t *oq4, + uint8_t *oq5) { + if (flat2 && flat && mask) { + const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, + p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5; + + // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1] + *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12; + *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12; + *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12; + *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12; + *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12; + *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12; + *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12; + *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12; + *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12; + *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12; + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} +#endif + +#if PARALLEL_DEBLOCKING_9_TAP +static INLINE void filter10(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op4, uint8_t *op3, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, + uint8_t *oq3, uint8_t *oq4) { + if (flat2 && flat && mask) { + const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4; + + // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1] + *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10; + *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10; + *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10; + *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10; + *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10; + *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10; + *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) / 10; + *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10; + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} +#endif + +static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint8_t *op7, uint8_t *op6, + uint8_t *op5, uint8_t *op4, uint8_t *op3, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, + uint8_t *oq3, uint8_t *oq4, uint8_t *oq5, + uint8_t *oq6, uint8_t *oq7) { + if (flat2 && flat && mask) { + const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, + p2 = *op2, p1 = *op1, p0 = *op0; + + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6, q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p], + p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p], + p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], + q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + +#if PARALLEL_DEBLOCKING_11_TAP + const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); + + filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p, + s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, + s + 3 * p, s + 4 * p, s + 5 * p); + +#elif PARALLEL_DEBLOCKING_9_TAP + const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4); + + filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p, + s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p); +#else + const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); + + filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, + s + 7 * p); +#endif + + ++s; + } +} + +void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +} + +static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < count; ++i) { + const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], + p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], + q5 = s[5], q6 = s[6], q7 = s[7]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + +#if PARALLEL_DEBLOCKING_11_TAP + const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); + + filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2, + s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5); +#elif PARALLEL_DEBLOCKING_9_TAP + const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4); + + filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s, + s + 1, s + 2, s + 3, s + 4); + +#else + const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); + + filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, + s + 7); +#endif + + s += p; + } +} + +void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +} + +void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); +} + +#if CONFIG_HIGHBITDEPTH +#if CONFIG_PARALLEL_DEBLOCKING +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} +#endif // CONFIG_PARALLEL_DEBLOCKING + +// Should we apply any filter at all: 11111111 yes, 00000000 no ? +static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, + uint16_t p3, uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, uint16_t q1, + uint16_t q2, uint16_t q3, int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p3 - p2) > limit16) * -1; + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(q3 - q2) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, + uint16_t p1, uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, uint16_t q3, + int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + mask |= (abs(p3 - p0) > thresh16) * -1; + mask |= (abs(q3 - q0) > thresh16) * -1; + return ~mask; +} + +static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3, + uint16_t p2, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, uint16_t q2, + uint16_t q3, uint16_t q4, int bd) { + int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p4 - p0) > thresh16) * -1; + mask |= (abs(q4 - q0) > thresh16) * -1; + return ~mask; +} + +// Is there high edge variance internal edge: +// 11111111_11111111 yes, 00000000_00000000 no ? +static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, int bd) { + int16_t hev = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + hev |= (abs(p1 - p0) > thresh16) * -1; + hev |= (abs(q1 - q0) > thresh16) * -1; + return hev; +} + +static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + int bd) { + int16_t filter1, filter2; + // ^0x80 equivalent to subtracting 0x80 from the values to turn them + // into -128 to +127 instead of 0 to 255. + int shift = bd - 8; + const int16_t ps1 = (int16_t)*op1 - (0x80 << shift); + const int16_t ps0 = (int16_t)*op0 - (0x80 << shift); + const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift); + const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift); + const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd); + + // Add outer taps if we have high edge variance. + int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev; + + // Inner taps. + filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; + + // Save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way. + filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; + filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; + + *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift); + *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift); + + // Outer tap adjustments. + filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + + *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift); + *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); +} + +void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); +#endif // !CONFIG_PARALLEL_DEBLOCKING + highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_4_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { +#if !CONFIG_PARALLEL_DEBLOCKING + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); +#else // CONFIG_PARALLEL_DEBLOCKING + const uint16_t p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1]; + const int8_t mask = + highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); +#endif // !CONFIG_PARALLEL_DEBLOCKING + highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_4_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, + uint16_t *op3, uint16_t *op2, uint16_t *op1, + uint16_t *op0, uint16_t *oq0, uint16_t *oq1, + uint16_t *oq2, uint16_t *oq3, int bd) { + if (flat && mask) { + const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + // 7-tap filter [1, 1, 1, 2, 1, 1, 1] + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} + +void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, + s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_8_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; + + for (i = 0; i < 8; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, + s + 2, s + 3, bd); + s += pitch; + } +} + +void aom_highbd_lpf_vertical_8_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} + +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, + uint8_t flat2, uint16_t *op7, uint16_t *op6, + uint16_t *op5, uint16_t *op4, uint16_t *op3, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + uint16_t *oq3, uint16_t *oq4, uint16_t *oq5, + uint16_t *oq6, uint16_t *oq7, int bd) { + if (flat2 && flat && mask) { + const uint16_t p7 = *op7; + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + const uint16_t q7 = *oq7; + + // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] + *op6 = ROUND_POWER_OF_TWO( + p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); + *op5 = ROUND_POWER_OF_TWO( + p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO( + p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO( + p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO( + p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, + 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, + 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + + q1 + q2 + q3 + q4 + q5 + q6, + 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + + q2 + q3 + q4 + q5 + q6 + q7, + 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + + q3 + q4 + q5 + q6 + q7 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO( + p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} + +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint16_t p3 = s[-4 * p]; + const uint16_t p2 = s[-3 * p]; + const uint16_t p1 = s[-2 * p]; + const uint16_t p0 = s[-p]; + const uint16_t q0 = s[0 * p]; + const uint16_t q1 = s[1 * p]; + const uint16_t q2 = s[2 * p]; + const uint16_t q3 = s[3 * p]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = + highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, + s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, + s + 6 * p, s + 7 * p, bd); + ++s; + } +} + +void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +} + +void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); +} + +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int count, + int bd) { + int i; + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4]; + const uint16_t p2 = s[-3]; + const uint16_t p1 = s[-2]; + const uint16_t p0 = s[-1]; + const uint16_t q0 = s[0]; + const uint16_t q1 = s[1]; + const uint16_t q2 = s[2]; + const uint16_t q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = + highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, + s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, + s + 5, s + 6, s + 7, bd); + s += p; + } +} + +void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); +} + +void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c new file mode 100644 index 000000000..4c6e201e1 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./macros_msa.h" + +void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, + char blackclamp[16], char whiteclamp[16], + char bothclamp[16], uint32_t width, + uint32_t height, int32_t pitch) { + uint32_t i, j; + + for (i = 0; i < height / 2; ++i) { + uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; + int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff)); + uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; + int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff)); + for (j = width / 16; j--;) { + v16i8 temp00_s, temp01_s; + v16u8 temp00, temp01, black_clamp, white_clamp; + v16u8 pos0, ref0, pos1, ref1; + v16i8 const127 = __msa_ldi_b(127); + + pos0 = LD_UB(pos0_ptr); + ref0 = LD_UB(ref0_ptr); + pos1 = LD_UB(pos1_ptr); + ref1 = LD_UB(ref1_ptr); + black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); + white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); + temp00 = (pos0 < black_clamp); + pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); + temp01 = (pos1 < black_clamp); + pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); + XORI_B2_128_UB(pos0, pos1); + temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); + temp00 = (v16u8)(temp00_s < pos0); + pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); + temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); + temp01 = (temp01_s < pos1); + pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); + XORI_B2_128_UB(pos0, pos1); + pos0 += ref0; + ST_UB(pos0, pos0_ptr); + pos1 += ref1; + ST_UB(pos1, pos1_ptr); + pos0_ptr += 16; + pos1_ptr += 16; + ref0_ptr += 16; + ref1_ptr += 16; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c new file mode 100644 index 000000000..847394a3d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c @@ -0,0 +1,704 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 mask0, mask1, mask2, mask3; + v8i16 filt, res0, res1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, res0, res1); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + XORI_B2_128_UB(res2, res3); + AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8i16 filt, vec0, vec1, vec2, vec3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec0, vec1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, vec2, vec3); + SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); + SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); + PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, + res3); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + XORI_B2_128_UB(res0, res2); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); + AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); +} + +static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, dst0, dst1; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height >> 1; loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + src += (2 * src_stride); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + LD_UB2(dst, dst_stride, dst0, dst1); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); + dst += dst_stride; + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, + vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(dst, 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 dst1, dst2, mask0, mask1, mask2, mask3; + v8i16 filt, out0, out1, out2, out3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + for (cnt = 0; cnt < 2; ++cnt) { + src0 = LD_SB(&src[cnt << 5]); + src2 = LD_SB(&src[16 + (cnt << 5)]); + src3 = LD_SB(&src[24 + (cnt << 5)]); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, + vec12); + VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, + vec13); + VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, + vec14); + VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, + vec15); + DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, + vec1, vec2, vec3); + DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, + vec9, vec10, vec11); + DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, + vec1, vec2, vec3); + DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, + vec9, vec10, vec11); + ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + LD_UB2(&dst[cnt << 5], 16, dst1, dst2); + PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); + PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); + } + + src += src_stride; + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); +} + +static void common_hz_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD_SB4(src, src_stride, src0, src1, src2, src3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, + dst_stride); + } +} + +static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + + for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res3, res2, dst1, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + dst += dst_stride; + PCKEV_AVG_ST_UB(res7, res6, dst3, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, + res2, res3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, + res6, res7); + SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); + SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); + LD_UB2(dst, 16, dst0, dst1); + PCKEV_AVG_ST_UB(res1, res0, dst0, dst); + PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); + dst += dst_stride; + LD_UB2(dst, 16, dst2, dst3); + PCKEV_AVG_ST_UB(res5, res4, dst2, dst); + PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); + dst += dst_stride; + } +} + +static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + LD_SB4(src, 16, src0, src2, src4, src6); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + LD_UB4(dst, 16, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST_UB(out1, out0, dst0, dst); + PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); + PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); + PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 8: + common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 16: + common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 32: + common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + case 64: + common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 8: + common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 16: + common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 32: + common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + case 64: + common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c new file mode 100644 index 000000000..bed600d5b --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_hv_8ht_8vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + SRARI_H2_SH(res0, res1, FILTER_BITS); + SAT_SH2_SH(res0, res1, 7); + PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); + XORI_B2_128_UB(tmp0, tmp1); + AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + vec0 = vec2; + vec1 = vec3; + vec2 = vec4; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 dst0, dst1, dst2, dst3, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, + dst6); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, + tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, + res3); + AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_4w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); +} + +static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_8w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert); + } else { + common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( + src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_and_aver_dst_16w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_32w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_and_aver_dst_64w_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + src += 16; + dst += 16; + } +} + +void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0 && + ((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 8: + common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], h); + break; + case 16: + common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 32: + common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + case 64: + common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, + &filt_hor[3], &filt_ver[3], h); + break; + default: + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else if (((const int32_t *)filter_x)[0] == 0 || + ((const int32_t *)filter_y)[0] == 0) { + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 8: + common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 16: + common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 32: + common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + case 64: + common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, + filt_ver, h); + break; + default: + aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c new file mode 100644 index 000000000..dae771104 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3, out; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 dst0, dst1, dst2, dst3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1, out2, out3; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, + filt2, filt3); + out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, + filt2, filt3); + out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, + filt2, filt3); + out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_and_aver_dst_16w_mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, int32_t width) { + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16i8 filt0, filt1, filt2, filt3; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + src_tmp += (4 * src_stride); + + LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); + XORI_B4_128_SB(src7, src8, src9, src10); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1, + dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 16); +} + +static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 32); +} + +static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, + filter, height, 64); +} + +static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16i8 src10_r, src32_r, src21_r, src43_r; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + src4 = LD_SB(src); + src += src_stride; + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16u8 src2110, src4332, src6554, src8776, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, + dst3); + ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4; + v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, + dst_stride); +} + +static void common_vt_2t_and_aver_dst_8x8mult_msa( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, + dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, + dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int8_t *filter, + int32_t height) { + if (4 == height) { + common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, + filter, height); + } +} + +static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB2(src, 16, src0, src5); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5; + v16u8 src6, src7, src8, src9, src10, src11, filt0; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 filt; + + /* rearranging filter_y */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(dst, dst_stride, dst0, dst1); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(dst + 16, dst_stride, dst2, dst3); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(dst + 32, dst_stride, dst4, dst5); + LD_UB2(src + 48, src_stride, src10, src11); + LD_UB2(dst + 48, dst_stride, dst6, dst7); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 8: + common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 16: + common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 32: + common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + case 64: + common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_ver[3], h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 8: + common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 16: + common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + + break; + case 32: + common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + case 64: + common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_ver, h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c new file mode 100644 index 000000000..fc3a823c5 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 mask0, mask1, mask2, mask3, out; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v8i16 filt, out0, out1; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + SRARI_H2_SH(out0, out1, FILTER_BITS); + SAT_SH2_SH(out0, out1, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 filt0, filt1, filt2, filt3; + v16i8 src0, src1, src2, src3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1); + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + out = PCKEV_XORI128_UB(out2, out3); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, + filt0, filt1, filt2, filt3, out0, out1, out2, + out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); +} + +static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (4 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + tmp0 = PCKEV_XORI128_UB(out0, out1); + tmp1 = PCKEV_XORI128_UB(out2, out3); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_SB2(src, src_stride, src0, src2); + LD_SB2(src + 8, src_stride, src1, src3); + XORI_B4_128_SB(src0, src1, src2, src3); + src += (2 * src_stride); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + dst += dst_stride; + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst); + dst += dst_stride; + } +} + +static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + int32_t loop_cnt; + v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 filt, out0, out1, out2, out3; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= 3; + + /* rearranging filter */ + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 16); + + src0 = LD_SB(src + 32); + src2 = LD_SB(src + 48); + src3 = LD_SB(src + 56); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + + XORI_B4_128_SB(src0, src1, src2, src3); + HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, + mask3, filt0, filt1, filt2, filt3, out0, out1, + out2, out3); + SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); + SAT_SH4_SH(out0, out1, out2, out3, 7); + out = PCKEV_XORI128_UB(out0, out1); + ST_UB(out, dst + 32); + out = PCKEV_XORI128_UB(out2, out3); + ST_UB(out, dst + 48); + dst += dst_stride; + } +} + +static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, mask; + v16u8 filt0, vec0, vec1, res0, res1; + v8u16 vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); + SRARI_H2_UH(vec2, vec3, FILTER_BITS); + PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 vec0, vec1, vec2, vec3, filt0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16i8 res0, res1, res2, res3; + v8u16 vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); + VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, + vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); + ST8x4_UB(src0, src1, dst, dst_stride); +} + +static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + v16u8 filt0; + v16i8 src0, src1, src2, src3, mask, out0, out1; + v8u16 vec0, vec1, vec2, vec3, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + if (16 == height) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); + ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + } +} + +static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + loop_cnt = (height >> 2) - 1; + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + + for (; loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + dst += dst_stride; + PCKEV_ST_SB(out2, out3, dst); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + dst += dst_stride; + PCKEV_ST_SB(out6, out7, dst); + dst += dst_stride; + } +} + +static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height >> 1; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src3 = LD_SB(src + 24); + src1 = __msa_sldi_b(src2, src0, 8); + src += src_stride; + src4 = LD_SB(src); + src6 = LD_SB(src + 16); + src7 = LD_SB(src + 24); + src5 = __msa_sldi_b(src6, src4, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + dst += dst_stride; + PCKEV_ST_SB(out4, out5, dst); + PCKEV_ST_SB(out6, out7, dst + 16); + dst += dst_stride; + } +} + +static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_UH(filter); + filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); + + for (loop_cnt = height; loop_cnt--;) { + src0 = LD_SB(src); + src2 = LD_SB(src + 16); + src4 = LD_SB(src + 32); + src6 = LD_SB(src + 48); + src7 = LD_SB(src + 56); + SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); + src += src_stride; + + VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_ST_SB(out0, out1, dst); + PCKEV_ST_SB(out2, out3, dst + 16); + PCKEV_ST_SB(out4, out5, dst + 32); + PCKEV_ST_SB(out6, out7, dst + 48); + dst += dst_stride; + } +} + +void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_hor[8]; + + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0) { + switch (w) { + case 4: + common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 8: + common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 16: + common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 32: + common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + case 64: + common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_hor[3], h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 8: + common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 16: + common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 32: + common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + case 64: + common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_hor, h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c new file mode 100644 index 000000000..a4d594931 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c @@ -0,0 +1,630 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +const uint8_t mc_filt_mask_arr[16 * 3] = { + /* 8 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + /* 4 width cases */ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + /* 4 width cases */ + 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 +}; + +static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, out; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + + mask0 = LD_UB(&mc_filt_mask_arr[16]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); + out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); + SAT_SH2_SH(tmp0, tmp1, 7); + out = PCKEV_XORI128_UB(tmp0, tmp1); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out5 = hz_out9; + out0 = out2; + out1 = out3; + out2 = out4; + } +} + +static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; + v16u8 mask0, mask1, mask2, mask3, vec0, vec1; + v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; + + mask0 = LD_UB(&mc_filt_mask_arr[0]); + src -= (3 + 3 * src_stride); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); + + mask1 = mask0 + 2; + mask2 = mask0 + 4; + mask3 = mask0 + 6; + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + + filt = LD_SH(filter_vert); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); + + ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); + ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); + ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + XORI_B4_128_SB(src7, src8, src9, src10); + + hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); + tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); + tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, + filt_hz1, filt_hz2, filt_hz3); + out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); + tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + + hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, + filt_hz0, filt_hz1, filt_hz2, filt_hz3); + out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); + tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, + filt_vt2, filt_vt3); + SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); + vec0 = PCKEV_XORI128_UB(tmp0, tmp1); + vec1 = PCKEV_XORI128_UB(tmp2, tmp3); + ST8x4_UB(vec0, vec1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out6 = hz_out10; + out0 = out2; + out1 = out3; + out2 = out8; + out4 = out6; + out5 = out7; + out6 = out9; + } +} + +static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 8; multiple8_cnt--;) { + common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 8; + dst += 8; + } +} + +static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask; + v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; + v16i8 res0, res1, res2, res3; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; + v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; + + mask = LD_SB(&mc_filt_mask_arr[16]); + + /* rearranging filter */ + filt = LD_UH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); + + filt = LD_UH(filter_vert); + filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + src8 = LD_SB(src); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); + hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); + hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); + SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, + hz_out3, hz_out5, 8); + hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); + + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, + vec5, vec6, vec7); + SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); + PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, + res3); + ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); + ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else if (8 == height) { + common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } +} + +static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert) { + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec1, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec2, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec3, filt_vt); + + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + int8_t *filter_horiz, + int8_t *filter_vert, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, mask, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_SB(src); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + LD_SB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp4 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); + PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp5 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp6 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp7 = __msa_dotp_u_h(vec0, filt_vt); + + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp8 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); + PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + if (4 == height) { + common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert); + } else { + common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height); + } +} + +static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; + v16u8 filt_hz, filt_vt, vec0, vec1; + v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 filt; + + mask = LD_SB(&mc_filt_mask_arr[0]); + + /* rearranging filter */ + filt = LD_SH(filter_horiz); + filt_hz = (v16u8)__msa_splati_h(filt, 0); + + filt = LD_SH(filter_vert); + filt_vt = (v16u8)__msa_splati_h(filt, 0); + + LD_SB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); + SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); + PCKEV_ST_SB(tmp1, tmp2, dst); + dst += dst_stride; + } +} + +static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 2; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter_horiz, int8_t *filter_vert, + int32_t height) { + int32_t multiple8_cnt; + for (multiple8_cnt = 4; multiple8_cnt--;) { + common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, + filter_vert, height); + src += 16; + dst += 16; + } +} + +void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int32_t x_step_q4, const int16_t *filter_y, + int32_t y_step_q4, int32_t w, int32_t h) { + int8_t cnt, filt_hor[8], filt_ver[8]; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 0; cnt < 8; ++cnt) { + filt_hor[cnt] = filter_x[cnt]; + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_x)[0] == 0 && + ((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 8: + common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 16: + common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 32: + common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + case 64: + common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, &filt_hor[3], + &filt_ver[3], (int32_t)h); + break; + default: + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + break; + } + } else if (((const int32_t *)filter_x)[0] == 0 || + ((const int32_t *)filter_y)[0] == 0) { + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + } else { + switch (w) { + case 4: + common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 8: + common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 16: + common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 32: + common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + case 64: + common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filt_hor, filt_ver, + (int32_t)h); + break; + default: + aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, + filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c new file mode 100644 index 000000000..f7bdfc2bd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/aom_convolve_msa.h" + +static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; + v16i8 src10998, filt0, filt1, filt2, filt3; + v16u8 out; + v8i16 filt, out10, out32; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, + src4332, src6554); + XORI_B3_128_SB(src2110, src4332, src6554); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); + XORI_B2_128_SB(src8776, src10998); + out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, + filt1, filt2, filt3); + out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, + filt1, filt2, filt3); + SRARI_H2_SH(out10, out32, FILTER_BITS); + SAT_SH2_SH(out10, out32, 7); + out = PCKEV_XORI128_UB(out10, out32); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + dst += (4 * dst_stride); + + src2110 = src6554; + src4332 = src8776; + src6554 = src10998; + src6 = src10; + } +} + +static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; + v16u8 tmp0, tmp1; + v8i16 filt, out0_r, out1_r, out2_r, out3_r; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); + tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src6 = src10; + } +} + +static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src += (4 * src_stride); + + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r, + tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } +} + +static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height, + int32_t width) { + const uint8_t *src_tmp; + uint8_t *dst_tmp; + uint32_t loop_cnt, cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16i8 filt0, filt1, filt2, filt3; + v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; + v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + + src -= (3 * src_stride); + + filt = LD_SH(filter); + SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); + XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); + src_tmp += (7 * src_stride); + ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, + src54_r, src21_r); + ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); + ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, + src54_l, src21_l); + ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); + XORI_B4_128_SB(src7, src8, src9, src10); + src_tmp += (4 * src_stride); + ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, + src87_r, src98_r, src109_r); + ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, + src87_l, src98_l, src109_l); + out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, + filt1, filt2, filt3); + out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, + filt1, filt2, filt3); + out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, + filt1, filt2, filt3); + out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, + filt1, filt2, filt3); + out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, + filt1, filt2, filt3); + out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, + filt1, filt2, filt3); + out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, + filt1, filt2, filt3); + out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, + filt1, filt2, filt3); + SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); + SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); + SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); + SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); + PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, tmp0, tmp1, tmp2, tmp3); + XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride); + dst_tmp += (4 * dst_stride); + + src10_r = src54_r; + src32_r = src76_r; + src54_r = src98_r; + src21_r = src65_r; + src43_r = src87_r; + src65_r = src109_r; + src10_l = src54_l; + src32_l = src76_l; + src54_l = src98_l; + src21_l = src65_l; + src43_l = src87_l; + src65_l = src109_l; + src6 = src10; + } + + src += 16; + dst += 16; + } +} + +static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 32); +} + +static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height, + 64); +} + +static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4; + v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332; + v16u8 filt0; + v8i16 filt; + v8u16 tmp0, tmp1; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB5(src, src_stride, src0, src1, src2, src3, src4); + src += (5 * src_stride); + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); +} + +static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r; + v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 filt; + + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + src8 = LD_SB(src); + src += src_stride; + + ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, + src76_r, src87_r); + ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, + src76_r, src2110, src4332, src6554, src8776); + DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, + tmp0, tmp1, tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); + ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); +} + +static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter); + } else if (8 == height) { + common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter); + } +} + +static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter) { + v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB5(src, src_stride, src0, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); + ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); +} + +static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v16i8 out0, out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); + src += (8 * src_stride); + + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, + vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); + ST8x4_UB(out0, out1, dst, dst_stride); + dst += (4 * dst_stride); + + src0 = src8; + } +} + +static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + if (4 == height) { + common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter); + } else { + common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height); + } +} + +static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + dst += dst_stride; + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst); + dst += dst_stride; + + src0 = src4; + } +} + +static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + src0 = LD_UB(src); + src5 = LD_UB(src + 16); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + + LD_UB4(src + 16, src_stride, src6, src7, src8, src9); + src += (4 * src_stride); + + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride); + + ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); + ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride); + + ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); + ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride); + dst += (4 * dst_stride); + + src0 = src4; + src5 = src9; + } +} + +static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int8_t *filter, int32_t height) { + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; + v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 filt; + + /* rearranging filter_y */ + filt = LD_SH(filter); + filt0 = (v16u8)__msa_splati_h(filt, 0); + + LD_UB4(src, 16, src0, src3, src6, src9); + src += src_stride; + + for (loop_cnt = (height >> 1); loop_cnt--;) { + LD_UB2(src, src_stride, src1, src2); + LD_UB2(src + 16, src_stride, src4, src5); + LD_UB2(src + 32, src_stride, src7, src8); + LD_UB2(src + 48, src_stride, src10, src11); + src += (2 * src_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride); + + ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); + ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 16); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride); + + ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); + ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + PCKEV_ST_SB(tmp0, tmp1, dst + 32); + + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride); + + ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); + ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); + SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); + PCKEV_ST_SB(tmp4, tmp5, dst + 48); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); + SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); + PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride); + dst += (2 * dst_stride); + + src0 = src2; + src3 = src5; + src6 = src8; + src9 = src11; + } +} + +void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + int8_t cnt, filt_ver[8]; + + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + for (cnt = 8; cnt--;) { + filt_ver[cnt] = filter_y[cnt]; + } + + if (((const int32_t *)filter_y)[0] == 0) { + switch (w) { + case 4: + common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 8: + common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 16: + common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 32: + common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + case 64: + common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + &filt_ver[3], h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } else { + switch (w) { + case 4: + common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 8: + common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 16: + common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 32: + common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + case 64: + common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, + filt_ver, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c new file mode 100644 index 000000000..75f8c7ea8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/macros_msa.h" + +static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + uint32_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + if (0 == (height % 4)) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + + out0 = __msa_copy_u_w((v4i32)dst0, 0); + out1 = __msa_copy_u_w((v4i32)dst1, 0); + out2 = __msa_copy_u_w((v4i32)dst2, 0); + out3 = __msa_copy_u_w((v4i32)dst3, 0); + SW4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == (height % 2)) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + + LD_UB2(dst, dst_stride, dst0, dst1); + + AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); + + out0 = __msa_copy_u_w((v4i32)dst0, 0); + out1 = __msa_copy_u_w((v4i32)dst1, 0); + SW(out0, dst); + dst += dst_stride; + SW(out1, dst); + dst += dst_stride; + } + } +} + +static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, + int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + + out0 = __msa_copy_u_d((v2i64)dst0, 0); + out1 = __msa_copy_u_d((v2i64)dst1, 0); + out2 = __msa_copy_u_d((v2i64)dst2, 0); + out3 = __msa_copy_u_d((v2i64)dst3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for (cnt = (height / 8); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); + dst += (8 * dst_stride); + } +} + +static void avg_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 8); cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); + LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); + dst_dup += (4 * dst_stride); + LD_UB4(src, src_stride, src8, src10, src12, src14); + LD_UB4(src + 16, src_stride, src9, src11, src13, src15); + src += (4 * src_stride); + LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); + LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); + dst_dup += (4 * dst_stride); + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); + + ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); + ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); + dst += (4 * dst_stride); + ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); + ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); + dst += (4 * dst_stride); + } +} + +static void avg_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint8_t *dst_dup = dst; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 src8, src9, src10, src11, src12, src13, src14, src15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(src, 16, src4, src5, src6, src7); + src += src_stride; + LD_UB4(src, 16, src8, src9, src10, src11); + src += src_stride; + LD_UB4(src, 16, src12, src13, src14, src15); + src += src_stride; + + LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); + dst_dup += dst_stride; + LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); + dst_dup += dst_stride; + + AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, + dst2, dst3); + AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, + dst6, dst7); + AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, + dst10, dst11); + AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, + dst13, dst14, dst15); + + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += dst_stride; + ST_UB4(dst4, dst5, dst6, dst7, dst, 16); + dst += dst_stride; + ST_UB4(dst8, dst9, dst10, dst11, dst, 16); + dst += dst_stride; + ST_UB4(dst12, dst13, dst14, dst15, dst, 16); + dst += dst_stride; + } +} + +void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 4: { + avg_width4_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 8: { + avg_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + avg_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + avg_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + avg_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + int32_t lp, cnt; + for (cnt = h; cnt--;) { + for (lp = 0; lp < w; ++lp) { + dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); + } + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c new file mode 100644 index 000000000..f7f116f4d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "aom_dsp/mips/macros_msa.h" + +static void copy_width8_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + out4 = __msa_copy_u_d((v2i64)src4, 0); + out5 = __msa_copy_u_d((v2i64)src5, 0); + out6 = __msa_copy_u_d((v2i64)src6, 0); + out7 = __msa_copy_u_d((v2i64)src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + for (cnt = height >> 3; cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + out4 = __msa_copy_u_d((v2i64)src4, 0); + out5 = __msa_copy_u_d((v2i64)src5, 0); + out6 = __msa_copy_u_d((v2i64)src6, 0); + out7 = __msa_copy_u_d((v2i64)src7, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 4) { + for (cnt = (height / 4); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + out2 = __msa_copy_u_d((v2i64)src2, 0); + out3 = __msa_copy_u_d((v2i64)src3, 0); + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 2) { + for (cnt = (height / 2); cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + out0 = __msa_copy_u_d((v2i64)src0, 0); + out1 = __msa_copy_u_d((v2i64)src1, 0); + + SD(out0, dst); + dst += dst_stride; + SD(out1, dst); + dst += dst_stride; + } + } +} + +static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + int32_t height, int32_t width) { + int32_t cnt, loop_cnt; + const uint8_t *src_tmp; + uint8_t *dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for (cnt = (width >> 4); cnt--;) { + src_tmp = src; + dst_tmp = dst; + + for (loop_cnt = (height >> 3); loop_cnt--;) { + LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + src_tmp += (8 * src_stride); + + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp, + dst_stride); + dst_tmp += (8 * dst_stride); + } + + src += 16; + dst += 16; + } +} + +static void copy_width16_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride); + dst += (8 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width32_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + int32_t cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if (0 == height % 12) { + for (cnt = (height / 12); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } else if (0 == height % 8) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32); + } else if (0 == height % 4) { + for (cnt = (height >> 2); cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + LD_UB4(src + 16, src_stride, src4, src5, src6, src7); + src += (4 * src_stride); + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride); + dst += (4 * dst_stride); + } + } +} + +static void copy_width64_msa(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, int32_t height) { + copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64); +} + +void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int32_t filter_x_stride, + const int16_t *filter_y, int32_t filter_y_stride, + int32_t w, int32_t h) { + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + + switch (w) { + case 4: { + uint32_t cnt, tmp; + /* 1 word storage */ + for (cnt = h; cnt--;) { + tmp = LW(src); + SW(tmp, dst); + src += src_stride; + dst += dst_stride; + } + break; + } + case 8: { + copy_width8_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 16: { + copy_width16_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 32: { + copy_width32_msa(src, src_stride, dst, dst_stride, h); + break; + } + case 64: { + copy_width64_msa(src, src_stride, dst, dst_stride, h); + break; + } + default: { + uint32_t cnt; + for (cnt = h; cnt--;) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + break; + } + } +} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h new file mode 100644 index 000000000..1a0ae4d8d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#define AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" + +extern const uint8_t mc_filt_mask_arr[16 * 3]; + +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ + }) + +#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ + filt_h1, filt_h2, filt_h3) \ + ({ \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8i16 hz_out_m; \ + \ + VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ + vec3_m); \ + hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ + filt_h1, filt_h2, filt_h3); \ + \ + hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ + hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + \ + hz_out_m; \ + }) + +#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ + DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ + DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ + DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \ + VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \ + ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \ + } + +#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, mask3, filt0, filt1, filt2, filt3, \ + out0, out1, out2, out3) \ + { \ + v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \ + DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \ + res4_m, res5_m, res6_m, res7_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \ + res0_m, res1_m, res2_m, res3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \ + DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \ + res4_m, res5_m, res6_m, res7_m); \ + ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \ + res7_m, out0, out1, out2, out3); \ + } + +#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = PCKEV_XORI128_UB(in1, in0); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } + +#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ + { \ + v16u8 tmp_m; \ + \ + tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ + ST_UB(tmp_m, (pdst)); \ + } + +#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ + stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ + PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } +#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/avg_msa.c b/third_party/aom/aom_dsp/mips/avg_msa.c new file mode 100644 index 000000000..0e1728155 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/avg_msa.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +uint32_t aom_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; + v4u32 sum = { 0 }; + + LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); + HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); + ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); + ADD2(sum0, sum2, sum4, sum6, sum0, sum4); + sum0 += sum4; + + sum = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); + sum = __msa_hadd_u_w(sum0, sum0); + sum = (v4u32)__msa_srari_w((v4i32)sum, 6); + sum_out = __msa_copy_u_w((v4i32)sum, 0); + + return sum_out; +} + +uint32_t aom_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { + uint32_t sum_out; + uint32_t src0, src1, src2, src3; + v16u8 vec = { 0 }; + v8u16 sum0; + v4u32 sum1; + v2u64 sum2; + + LW4(src, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, vec); + + sum0 = __msa_hadd_u_h(vec, vec); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); + sum1 = __msa_hadd_u_w(sum0, sum0); + sum2 = __msa_hadd_u_d(sum1, sum1); + sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); + sum_out = __msa_copy_u_w((v4i32)sum1, 0); + + return sum_out; +} diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c new file mode 100644 index 000000000..00ab75dc3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/common_dspr2.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH]; +uint8_t *aom_ff_cropTbl; + +void aom_dsputil_static_init(void) { + int i; + + for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i; + + for (i = 0; i < CROP_WIDTH; i++) { + aom_ff_cropTbl_a[i] = 0; + aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255; + } + + aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH]; +} + +#endif diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h new file mode 100644 index 000000000..31159fdcd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/common_dspr2.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_COMMON_MIPS_DSPR2_H_ +#define AOM_COMMON_MIPS_DSPR2_H_ + +#include +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif +#if HAVE_DSPR2 +#define CROP_WIDTH 512 + +extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c" + +static INLINE void prefetch_load(const unsigned char *src) { + __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store */ +static INLINE void prefetch_store(unsigned char *dst) { + __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} + +static INLINE void prefetch_load_streamed(const unsigned char *src) { + __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src)); +} + +/* prefetch data for store */ +static INLINE void prefetch_store_streamed(unsigned char *dst) { + __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst)); +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_COMMON_MIPS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c new file mode 100644 index 000000000..d557115b9 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, + w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c new file mode 100644 index 000000000..efbdcf60f --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3; + uint32_t tn1, tn2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p3], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3, tp4; + uint32_t p1, p2, p3, p4, n1; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tp4], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tp4], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tp3], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp4], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tp1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" + + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" + + /* store bytes */ + "sb %[tp3], 3(%[dst]) \n\t" + "sb %[tp4], 5(%[dst]) \n\t" + "sb %[tp1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: + convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: + aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c new file mode 100644 index 000000000..066308315 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + int32_t Temp1, Temp2; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp2](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp2], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp1](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_stride] "r"(dst_stride)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_8_transposed_dspr2( + const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[Temp1], %[p3](%[cm]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[Temp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr), + [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_bi_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +static void convolve_bi_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "ulw %[qload1], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload1] " + "\n\t" + "ulw %[qload2], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter45] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter45] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter45] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter45] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter45] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += 1; + } +} + +void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + sum += src[x] * filter[3]; + sum += src[x + 1] * filter[4]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h) { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 8: + convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + case 16: + case 32: + convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h, (w / 16)); + break; + case 64: + prefetch_load(src + 32); + convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride, + filter, h); + break; + default: + convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w, + h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c new file mode 100644 index 000000000..dc51ab1cb --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[p1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[p2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [Temp4] "=&r"(Temp4) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4; + uint32_t st0, st1; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tp3], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[p1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[p1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + const int16_t *filter = &filter_x0[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), + [dst] "r"(dst), [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(x_step_q4 == 16); + + prefetch_load((const uint8_t *)filter_x); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c new file mode 100644 index 000000000..3367be01a --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2; + uint32_t p1, p2; + uint32_t scratch1; + uint32_t store1, store2; + int32_t Temp1, Temp2; + const int16_t *filter = &filter_y[3]; + uint32_t filter45; + + filter45 = ((const int32_t *)filter)[0]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + + "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" + "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" + + "extp %[Temp1], $ac0, 31 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), + [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [filter45] "r"(filter45), [vector4a] "r"(vector4a), + [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + uint32_t pos = 38; + + assert(y_step_q4 == 16); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c new file mode 100644 index 000000000..298065adb --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "lbu %[scratch1], 0(%[dst_ptr]) \n\t" + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + "lbu %[scratch2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ + "extp %[Temp2], $ac3, 31 \n\t" + "lbu %[scratch1], 2(%[dst_ptr]) \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + "lbu %[scratch2], 3(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ + "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, + h); + break; + case 64: + prefetch_store(dst + 32); + convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, + h); + break; + default: + aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + + assert(w <= 64); + assert(h <= 64); + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + + if (intermediate_height < h) intermediate_height = h; + + aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, + x_step_q4, filter_y, y_step_q4, w, intermediate_height); + + aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); +} + +void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, int w, + int h) { + int x, y; + uint32_t tp1, tp2, tn1; + uint32_t tp3, tp4, tn2; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + + : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 8: + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 16: + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 32: + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 0(%[dst]) \n\t" + "ulw %[tp3], 4(%[src]) \n\t" + "ulw %[tp4], 4(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 8(%[src]) \n\t" + "ulw %[tp2], 8(%[dst]) \n\t" + "sw %[tn1], 0(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 4(%[dst]) \n\t" /* store */ + "ulw %[tp3], 12(%[src]) \n\t" + "ulw %[tp4], 12(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 16(%[src]) \n\t" + "ulw %[tp2], 16(%[dst]) \n\t" + "sw %[tn1], 8(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 12(%[dst]) \n\t" /* store */ + "ulw %[tp3], 20(%[src]) \n\t" + "ulw %[tp4], 20(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 24(%[src]) \n\t" + "ulw %[tp2], 24(%[dst]) \n\t" + "sw %[tn1], 16(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 20(%[dst]) \n\t" /* store */ + "ulw %[tp3], 28(%[src]) \n\t" + "ulw %[tp4], 28(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 32(%[dst]) \n\t" + "sw %[tn1], 24(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 28(%[dst]) \n\t" /* store */ + "ulw %[tp3], 36(%[src]) \n\t" + "ulw %[tp4], 36(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 40(%[src]) \n\t" + "ulw %[tp2], 40(%[dst]) \n\t" + "sw %[tn1], 32(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 36(%[dst]) \n\t" /* store */ + "ulw %[tp3], 44(%[src]) \n\t" + "ulw %[tp4], 44(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 48(%[src]) \n\t" + "ulw %[tp2], 48(%[dst]) \n\t" + "sw %[tn1], 40(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 44(%[dst]) \n\t" /* store */ + "ulw %[tp3], 52(%[src]) \n\t" + "ulw %[tp4], 52(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "ulw %[tp1], 56(%[src]) \n\t" + "ulw %[tp2], 56(%[dst]) \n\t" + "sw %[tn1], 48(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 52(%[dst]) \n\t" /* store */ + "ulw %[tp3], 60(%[src]) \n\t" + "ulw %[tp4], 60(%[dst]) \n\t" + "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ + "sw %[tn1], 56(%[dst]) \n\t" /* store */ + "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ + "sw %[tn2], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + break; + default: + for (y = h; y > 0; --y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c new file mode 100644 index 000000000..f6534b420 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -0,0 +1,998 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ + "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ + "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ + + /* clamp */ + "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ + "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ + "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ + + "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ + "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ + + "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ + "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + "lbu %[Temp2], 0(%[dst]) \n\t" + "lbu %[tn3], 2(%[dst]) \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" + "sb %[Temp2], 0(%[dst]) \n\t" + "sb %[tn3], 2(%[dst]) \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "lbu %[Temp2], 4(%[dst]) \n\t" + "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[Temp2], 4(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tp1], 6(%[dst]) \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + "lbu %[tp2], 1(%[dst]) \n\t" + "lbu %[tn2], 3(%[dst]) \n\t" + "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "lbu %[tn3], 5(%[dst]) \n\t" + + /* odd 4. pixel */ + "sb %[tp2], 1(%[dst]) \n\t" + "sb %[tp1], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbu %[tn1], 7(%[dst]) \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" + + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" + + "lbux %[n1], %[Temp1](%[cm]) \n\t" + "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" + + /* store bytes */ + "sb %[tn2], 3(%[dst]) \n\t" + "sb %[tn3], 5(%[dst]) \n\t" + "sb %[tn1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, + int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ + "ulw %[qload2], 16(%[src]) \n\t" + "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ + "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ + "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ + "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ + + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ + + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ + + "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ + "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ + "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), + [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), + [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), + [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 8: + convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + case 16: + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 1); + break; + case 32: + convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, + h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, + h); + break; + default: + aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, + h); + break; + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c new file mode 100644 index 000000000..c871702f4 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c @@ -0,0 +1,1590 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + dst_ptr = dst; + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[p2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tn1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + "sb %[p2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_stride] "r"(dst_stride)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, + int32_t src_stride, uint8_t *dst, + int32_t dst_stride, + const int16_t *filter_x0, + int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint8_t *dst_ptr; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2, tp3; + uint32_t p1, p2, p3, p4, n1; + uint8_t *odd_dst; + uint32_t dst_pitch_2 = (dst_stride << 1); + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + + dst_ptr = dst; + odd_dst = (dst_ptr + dst_stride); + + __asm__ __volatile__( + "ulw %[tp2], 0(%[src]) \n\t" + "ulw %[tp1], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tp1] \n\t" + "preceu.ph.qbl %[p4], %[tp1] \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tp3] \n\t" + "preceu.ph.qbl %[n1], %[tp3] \n\t" + "ulw %[tp2], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tp2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "lbux %[tp3], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[p3], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[Temp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "sb %[tp3], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + + "ulw %[tp1], 1(%[src]) \n\t" + "ulw %[tp3], 5(%[src]) \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[tp2], %[p3](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp3] \n\t" + "preceu.ph.qbl %[p4], %[tp3] \n\t" + "sb %[tp2], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "ulw %[tp2], 9(%[src]) \n\t" + + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp1], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[n1], %[tp2] \n\t" + "ulw %[Temp1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "sb %[tp1], 0(%[dst_ptr]) \n\t" + "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[tp3], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[Temp1] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[tp3], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[p2], 0(%[odd_dst]) \n\t" + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" + + "sb %[n1], 0(%[odd_dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), + [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + /* Next row... */ + src += src_stride; + dst += 1; + } +} + +static void convolve_horiz_16_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +static void convolve_horiz_64_transposed_dspr2( + const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, + int32_t dst_stride, const int16_t *filter_x0, int32_t h) { + int32_t c, y; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + uint32_t dst_pitch_2 = (dst_stride << 1); + uint8_t *odd_dst; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + + src = src_ptr; + dst = dst_ptr; + + odd_dst = (dst + dst_stride); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) " + "\n\t" + "ulw %[qload2], 4(%[src]) " + "\n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 1 */ + "mthi $zero, $ac1 " + "\n\t" + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 2 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "ulw %[qload2], 8(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] " + "\n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 3 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "ulw %[qload1], 12(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] " + "\n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] " + "\n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 4 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 1 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + " \n\t" + "dpa.w.ph $ac3, %[p3], %[filter12] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] " + "\n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] " + "\n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 5 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 2 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 16(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] " + "\n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] " + "\n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* even 6 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 3 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter12] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] " + "\n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] " + "\n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* even 7 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 4 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 20(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* even 8 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 5 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 1 */ + "mthi $zero, $ac3 " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* even 8 */ + "sb %[st3], 0(%[dst]) " + "\n\t" /* even 6 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) " + "\n\t" + "ulw %[qload2], 5(%[src]) " + "\n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 2 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload1] " + "\n\t" + "preceu.ph.qbl %[p2], %[qload1] " + "\n\t" + "preceu.ph.qbr %[p3], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p4], %[qload2] " + "\n\t" + "sb %[st1], 0(%[dst]) " + "\n\t" /* even 7 */ + "addu %[dst], %[dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 9(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] " + "\n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] " + "\n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 3 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p1], %[qload2] " + "\n\t" + "preceu.ph.qbl %[p5], %[qload2] " + "\n\t" + "sb %[st2], 0(%[dst]) " + "\n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) " + "\n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] " + "\n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] " + "\n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 4 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbr %[p2], %[qload1] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 1 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] " + "\n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] " + "\n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 5 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbl %[p3], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 2 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload2], 17(%[src]) " + "\n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] " + "\n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] " + "\n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 " + "\n\t" /* odd 6 */ + "mthi $zero, $ac2 " + "\n\t" + "preceu.ph.qbr %[p4], %[qload2] " + "\n\t" + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 3 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] " + "\n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] " + "\n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 " + "\n\t" /* odd 7 */ + "mthi $zero, $ac3 " + "\n\t" + "preceu.ph.qbl %[p1], %[qload2] " + "\n\t" + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 4 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "ulw %[qload1], 21(%[src]) " + "\n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] " + "\n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] " + "\n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 " + "\n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 " + "\n\t" /* odd 8 */ + "mthi $zero, $ac1 " + "\n\t" + "preceu.ph.qbr %[p5], %[qload1] " + "\n\t" + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 5 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + "dpa.w.ph $ac3, %[p2], %[filter12] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] " + "\n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] " + "\n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 " + "\n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] " + "\n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] " + "\n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 " + "\n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) " + "\n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) " + "\n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) " + "\n\t" /* odd 8 */ + + "sb %[st2], 0(%[odd_dst]) " + "\n\t" /* odd 6 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st3], 0(%[odd_dst]) " + "\n\t" /* odd 7 */ + "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " + "\n\t" + + "sb %[st1], 0(%[odd_dst]) " + "\n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), + [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), + [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), + [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), + [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), + [dst_pitch_2] "r"(dst_pitch_2)); + + src += 16; + dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); + odd_dst = (dst + dst_stride); + } + + /* Next row... */ + src_ptr += src_stride; + + dst_ptr += 1; + } +} + +void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter, int w, int h) { + int x, y, k; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + int sum = 0; + + for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; + + dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + + src += src_stride; + dst += 1; + } +} + +void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x * dst_stride] = src[x]; + } + + src += src_stride; + dst += 1; + } +} + +void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); + int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; + uint32_t pos = 38; + + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + if (intermediate_height < h) intermediate_height = h; + + /* copy the src to dst */ + if (filter_x[3] == 0x80) { + copy_horiz_transposed(src - src_stride * 3, src_stride, temp, + intermediate_height, w, intermediate_height); + } else if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp, + intermediate_height, filter_x, w, intermediate_height); + } else { + src -= (src_stride * 3 + 3); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + + switch (w) { + case 4: + convolve_horiz_4_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + case 8: + convolve_horiz_8_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height, (w / 16)); + break; + case 64: + prefetch_load(src + 32); + convolve_horiz_64_transposed_dspr2(src, src_stride, temp, + intermediate_height, filter_x, + intermediate_height); + break; + default: + convolve_horiz_transposed(src, src_stride, temp, intermediate_height, + filter_x, w, intermediate_height); + break; + } + } + + /* copy the src to dst */ + if (filter_y[3] == 0x80) { + copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); + } else if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, + filter_y, h, w); + } else { + switch (h) { + case 4: + convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 8: + convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + case 16: + case 32: + convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w, (h / 16)); + break; + case 64: + convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, + dst_stride, filter_y, w); + break; + default: + convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, + filter_y, h, w); + break; + } + } +} + +void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: { + uint32_t tp1; + + /* 1 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], (%[src]) \n\t" + "sw %[tp1], (%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 8: { + uint32_t tp1, tp2; + + /* 2 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 16: { + uint32_t tp1, tp2, tp3, tp4; + + /* 4 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 32: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + /* 8 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + case 64: { + uint32_t tp1, tp2, tp3, tp4; + uint32_t tp5, tp6, tp7, tp8; + + prefetch_load(src + 64); + prefetch_store(dst + 32); + + /* 16 word storage */ + for (y = h; y--;) { + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_load(src + src_stride + 64); + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + "ulw %[tp3], 8(%[src]) \n\t" + "ulw %[tp4], 12(%[src]) \n\t" + "ulw %[tp5], 16(%[src]) \n\t" + "ulw %[tp6], 20(%[src]) \n\t" + "ulw %[tp7], 24(%[src]) \n\t" + "ulw %[tp8], 28(%[src]) \n\t" + + "sw %[tp1], 0(%[dst]) \n\t" /* store */ + "sw %[tp2], 4(%[dst]) \n\t" /* store */ + "sw %[tp3], 8(%[dst]) \n\t" /* store */ + "sw %[tp4], 12(%[dst]) \n\t" /* store */ + "sw %[tp5], 16(%[dst]) \n\t" /* store */ + "sw %[tp6], 20(%[dst]) \n\t" /* store */ + "sw %[tp7], 24(%[dst]) \n\t" /* store */ + "sw %[tp8], 28(%[dst]) \n\t" /* store */ + + "ulw %[tp1], 32(%[src]) \n\t" + "ulw %[tp2], 36(%[src]) \n\t" + "ulw %[tp3], 40(%[src]) \n\t" + "ulw %[tp4], 44(%[src]) \n\t" + "ulw %[tp5], 48(%[src]) \n\t" + "ulw %[tp6], 52(%[src]) \n\t" + "ulw %[tp7], 56(%[src]) \n\t" + "ulw %[tp8], 60(%[src]) \n\t" + + "sw %[tp1], 32(%[dst]) \n\t" /* store */ + "sw %[tp2], 36(%[dst]) \n\t" /* store */ + "sw %[tp3], 40(%[dst]) \n\t" /* store */ + "sw %[tp4], 44(%[dst]) \n\t" /* store */ + "sw %[tp5], 48(%[dst]) \n\t" /* store */ + "sw %[tp6], 52(%[dst]) \n\t" /* store */ + "sw %[tp7], 56(%[dst]) \n\t" /* store */ + "sw %[tp8], 60(%[dst]) \n\t" /* store */ + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), + [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6), + [tp7] "=&r"(tp7), [tp8] "=&r"(tp8) + : [src] "r"(src), [dst] "r"(dst)); + + src += src_stride; + dst += dst_stride; + } + } break; + default: + for (y = h; y--;) { + for (x = 0; x < w; ++x) { + dst[x] = src[x]; + } + + src += src_stride; + dst += dst_stride; + } + break; + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c new file mode 100644 index 000000000..c60557617 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c @@ -0,0 +1,878 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3, Temp4; + uint32_t vector4a = 64; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4; + uint32_t n1, n2, n3, n4; + uint32_t tn1, tn2; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* odd 1. pixel */ + "lbux %[tp1], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[n1], %[tp2] \n\t" + "preceu.ph.qbl %[n2], %[tp2] \n\t" + "preceu.ph.qbr %[n3], %[tn2] \n\t" + "preceu.ph.qbl %[n4], %[tn2] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "lbux %[tp2], %[Temp3](%[cm]) \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[n1], %[tn1] \n\t" + "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" + "extp %[Temp4], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[tn1], %[Temp2](%[cm]) \n\t" + "lbux %[n2], %[Temp4](%[cm]) \n\t" + + /* store bytes */ + "sb %[tp1], 0(%[dst]) \n\t" + "sb %[tn1], 1(%[dst]) \n\t" + "sb %[tp2], 2(%[dst]) \n\t" + "sb %[n2], 3(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), + [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2, Temp3; + uint32_t tp1, tp2; + uint32_t p1, p2, p3, p4, n1; + uint32_t tn1, tn2, tn3; + uint32_t st0, st1; + + vector1b = ((const int32_t *)filter_x0)[0]; + vector2b = ((const int32_t *)filter_x0)[1]; + vector3b = ((const int32_t *)filter_x0)[2]; + vector4b = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_load(src + src_stride); + prefetch_load(src + src_stride + 32); + prefetch_store(dst + dst_stride); + + __asm__ __volatile__( + "ulw %[tp1], 0(%[src]) \n\t" + "ulw %[tp2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tp1] \n\t" + "preceu.ph.qbl %[p2], %[tp1] \n\t" + "preceu.ph.qbr %[p3], %[tp2] \n\t" + "preceu.ph.qbl %[p4], %[tp2] \n\t" + "ulw %[tn2], 8(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp1], $ac3, 31 \n\t" + + /* even 2. pixel */ + "preceu.ph.qbr %[p1], %[tn2] \n\t" + "preceu.ph.qbl %[n1], %[tn2] \n\t" + "ulw %[tn1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + /* even 3. pixel */ + "lbux %[st0], %[Temp1](%[cm]) \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[tn1] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + /* even 4. pixel */ + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sb %[st0], 0(%[dst]) \n\t" + "lbux %[st1], %[Temp3](%[cm]) \n\t" + + "balign %[tn3], %[tn1], 3 \n\t" + "balign %[tn1], %[tn2], 3 \n\t" + "balign %[tn2], %[tp2], 3 \n\t" + "balign %[tp2], %[tp1], 3 \n\t" + + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp3], $ac2, 31 \n\t" + + "lbux %[st0], %[Temp1](%[cm]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector4a], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sb %[st1], 2(%[dst]) \n\t" + "preceu.ph.qbr %[p1], %[tp2] \n\t" + "preceu.ph.qbl %[p2], %[tp2] \n\t" + "preceu.ph.qbr %[p3], %[tn2] \n\t" + "preceu.ph.qbl %[p4], %[tn2] \n\t" + "sb %[st0], 4(%[dst]) \n\t" + "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 2. pixel */ + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[tn1] \n\t" + "preceu.ph.qbl %[n1], %[tn1] \n\t" + "lbux %[st0], %[Temp3](%[cm]) \n\t" + "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" + "extp %[Temp3], $ac1, 31 \n\t" + + /* odd 3. pixel */ + "lbux %[st1], %[Temp2](%[cm]) \n\t" + "preceu.ph.qbr %[p2], %[tn3] \n\t" + "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + /* odd 4. pixel */ + "sb %[st1], 1(%[dst]) \n\t" + "sb %[st0], 6(%[dst]) \n\t" + "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" + "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + /* clamp */ + "lbux %[p4], %[Temp3](%[cm]) \n\t" + "lbux %[p2], %[Temp2](%[cm]) \n\t" + "lbux %[n1], %[Temp1](%[cm]) \n\t" + + /* store bytes */ + "sb %[p4], 3(%[dst]) \n\t" + "sb %[p2], 5(%[dst]) \n\t" + "sb %[n1], 7(%[dst]) \n\t" + + : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), + [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), + [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h, + int32_t count) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_store(dst_ptr + dst_stride); + + for (c = 0; c < count; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, + uint8_t *dst_ptr, int32_t dst_stride, + const int16_t *filter_x0, int32_t h) { + int32_t y, c; + const uint8_t *src; + uint8_t *dst; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector_64 = 64; + int32_t filter12, filter34, filter56, filter78; + int32_t Temp1, Temp2, Temp3; + uint32_t qload1, qload2, qload3; + uint32_t p1, p2, p3, p4, p5; + uint32_t st1, st2, st3; + + filter12 = ((const int32_t *)filter_x0)[0]; + filter34 = ((const int32_t *)filter_x0)[1]; + filter56 = ((const int32_t *)filter_x0)[2]; + filter78 = ((const int32_t *)filter_x0)[3]; + + for (y = h; y--;) { + src = src_ptr; + dst = dst_ptr; + + /* prefetch data to cache memory */ + prefetch_load(src_ptr + src_stride); + prefetch_load(src_ptr + src_stride + 32); + prefetch_load(src_ptr + src_stride + 64); + prefetch_store(dst_ptr + dst_stride); + prefetch_store(dst_ptr + dst_stride + 32); + + for (c = 0; c < 4; c++) { + __asm__ __volatile__( + "ulw %[qload1], 0(%[src]) \n\t" + "ulw %[qload2], 4(%[src]) \n\t" + + /* even 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ + "mthi $zero, $ac1 \n\t" + "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "ulw %[qload3], 8(%[src]) \n\t" + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ + + /* even 2. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "ulw %[qload1], 12(%[src]) \n\t" + "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ + "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ + + /* even 3. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st1], 0(%[dst]) \n\t" /* even 1 */ + "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ + "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ + + /* even 4. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st2], 2(%[dst]) \n\t" /* even 1 */ + "ulw %[qload2], 16(%[src]) \n\t" + "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ + "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ + + /* even 5. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st3], 4(%[dst]) \n\t" /* even 3 */ + "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ + "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ + + /* even 6. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st1], 6(%[dst]) \n\t" /* even 4 */ + "ulw %[qload3], 20(%[src]) \n\t" + "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ + "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ + + /* even 7. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st2], 8(%[dst]) \n\t" /* even 5 */ + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ + "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ + + /* even 8. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ + "mthi $zero, $ac3 \n\t" + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ + "sb %[st3], 10(%[dst]) \n\t" /* even 6 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ + "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ + + /* ODD pixels */ + "ulw %[qload1], 1(%[src]) \n\t" + "ulw %[qload2], 5(%[src]) \n\t" + + /* odd 1. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p1], %[qload1] \n\t" + "preceu.ph.qbl %[p2], %[qload1] \n\t" + "preceu.ph.qbr %[p3], %[qload2] \n\t" + "preceu.ph.qbl %[p4], %[qload2] \n\t" + "sb %[st1], 12(%[dst]) \n\t" /* even 7 */ + "ulw %[qload3], 9(%[src]) \n\t" + "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ + "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ + + /* odd 2. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p1], %[qload3] \n\t" + "preceu.ph.qbl %[p5], %[qload3] \n\t" + "sb %[st2], 14(%[dst]) \n\t" /* even 8 */ + "ulw %[qload1], 13(%[src]) \n\t" + "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ + "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ + + /* odd 3. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbr %[p2], %[qload1] \n\t" + "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */ + "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ + "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ + + /* odd 4. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbl %[p3], %[qload1] \n\t" + "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */ + "ulw %[qload2], 17(%[src]) \n\t" + "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ + "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ + + /* odd 5. pixel */ + "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ + "mthi $zero, $ac2 \n\t" + "preceu.ph.qbr %[p4], %[qload2] \n\t" + "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */ + "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ + "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ + + /* odd 6. pixel */ + "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ + "mthi $zero, $ac3 \n\t" + "preceu.ph.qbl %[p1], %[qload2] \n\t" + "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */ + "ulw %[qload3], 21(%[src]) \n\t" + "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ + "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ + "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ + + /* odd 7. pixel */ + "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ + "mthi $zero, $ac1 \n\t" + "preceu.ph.qbr %[p5], %[qload3] \n\t" + "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */ + "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ + "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ + "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ + + /* odd 8. pixel */ + "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ + "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ + "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ + + "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ + "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ + "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ + + "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */ + "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */ + "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */ + + : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), + [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2), + [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), + [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) + : [filter12] "r"(filter12), [filter34] "r"(filter34), + [filter56] "r"(filter56), [filter78] "r"(filter78), + [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), + [src] "r"(src)); + + src += 16; + dst += 16; + } + + /* Next row... */ + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(x_step_q4 == 16); + assert(((const int32_t *)filter_x)[1] != 0x800000); + + if (((const int32_t *)filter_x)[0] == 0) { + aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + prefetch_load((const uint8_t *)filter_x); + src -= 3; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + /* prefetch data to cache memory */ + prefetch_load(src); + prefetch_load(src + 32); + prefetch_store(dst); + + switch (w) { + case 4: + convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 8: + convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + case 16: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 1); + break; + case 32: + convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h, 2); + break; + case 64: + prefetch_load(src + 64); + prefetch_store(dst + 32); + + convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst, + (int32_t)dst_stride, filter_x, (int32_t)h); + break; + default: + aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c new file mode 100644 index 000000000..d8a90b6ab --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/convolve_common_dspr2.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_ports/mem.h" + +#if HAVE_DSPR2 +static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t w, + int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + + for (x = 0; x < w; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, + uint8_t *dst, int32_t dst_stride, + const int16_t *filter_y, int32_t h) { + int32_t x, y; + const uint8_t *src_ptr; + uint8_t *dst_ptr; + uint8_t *cm = aom_ff_cropTbl; + uint32_t vector4a = 64; + uint32_t load1, load2, load3, load4; + uint32_t p1, p2; + uint32_t n1, n2; + uint32_t scratch1, scratch2; + uint32_t store1, store2; + int32_t vector1b, vector2b, vector3b, vector4b; + int32_t Temp1, Temp2; + + vector1b = ((const int32_t *)filter_y)[0]; + vector2b = ((const int32_t *)filter_y)[1]; + vector3b = ((const int32_t *)filter_y)[2]; + vector4b = ((const int32_t *)filter_y)[3]; + + src -= 3 * src_stride; + + for (y = h; y--;) { + /* prefetch data to cache memory */ + prefetch_store(dst + dst_stride); + prefetch_store(dst + dst_stride + 32); + + for (x = 0; x < 64; x += 4) { + src_ptr = src + x; + dst_ptr = dst + x; + + __asm__ __volatile__( + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "mtlo %[vector4a], $ac0 \n\t" + "mtlo %[vector4a], $ac1 \n\t" + "mtlo %[vector4a], $ac2 \n\t" + "mtlo %[vector4a], $ac3 \n\t" + "mthi $zero, $ac0 \n\t" + "mthi $zero, $ac1 \n\t" + "mthi $zero, $ac2 \n\t" + "mthi $zero, $ac3 \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" + "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" + + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load1], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load2], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load3], 0(%[src_ptr]) \n\t" + "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" + "ulw %[load4], 0(%[src_ptr]) \n\t" + + "preceu.ph.qbr %[scratch1], %[load1] \n\t" + "preceu.ph.qbr %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbr %[scratch2], %[load3] \n\t" + "preceu.ph.qbr %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac0, 31 \n\t" + "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + "preceu.ph.qbl %[scratch1], %[load1] \n\t" + "preceu.ph.qbl %[p1], %[load2] \n\t" + "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ + "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ + "preceu.ph.qbl %[scratch2], %[load3] \n\t" + "preceu.ph.qbl %[p2], %[load4] \n\t" + "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ + "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" + "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" + "extp %[Temp1], $ac2, 31 \n\t" + + "lbux %[store2], %[Temp2](%[cm]) \n\t" + "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" + "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" + "extp %[Temp2], $ac3, 31 \n\t" + + "sb %[store1], 0(%[dst_ptr]) \n\t" + "sb %[store2], 1(%[dst_ptr]) \n\t" + + "lbux %[store1], %[Temp1](%[cm]) \n\t" + "lbux %[store2], %[Temp2](%[cm]) \n\t" + + "sb %[store1], 2(%[dst_ptr]) \n\t" + "sb %[store2], 3(%[dst_ptr]) \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), + [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), + [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), + [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), + [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), + [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), + [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), + [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); + } + + /* Next row... */ + src += src_stride; + dst += dst_stride; + } +} + +void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + assert(y_step_q4 == 16); + assert(((const int32_t *)filter_y)[1] != 0x800000); + + if (((const int32_t *)filter_y)[0] == 0) { + aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + } else { + uint32_t pos = 38; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + prefetch_store(dst); + + switch (w) { + case 4: + case 8: + case 16: + case 32: + convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h); + break; + case 64: + prefetch_store(dst + 32); + convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); + break; + default: + aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h); + break; + } + } +} + +#endif diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h new file mode 100644 index 000000000..f8fd9e2b6 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ +#define AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter, int w, + int h); + +void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c new file mode 100644 index 000000000..dc9c63226 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c @@ -0,0 +1,948 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/fwd_txfm_msa.h" + +static void fdct8x32_1d_column_load_butterfly(const int16_t *input, + int32_t src_stride, + int16_t *temp_buff) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 step0, step1, step2, step3; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + v8i16 step0_1, step1_1, step2_1, step3_1; + + /* 1st and 2nd set */ + LD_SH4(input, src_stride, in0, in1, in2, in3); + LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff, 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); + + /* 3rd and 4th set */ + LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); + LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); + LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); + LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); + SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, + step3, in4, in5, in6, in7); + BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, + step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); + ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); + ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); + ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); +} + +static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 temp0, temp1; + + /* fdct even */ + LD_SH4(input, 8, in0, in1, in2, in3); + LD_SH4(input + 96, 8, in12, in13, in14, in15); + BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, + vec3, in12, in13, in14, in15); + LD_SH4(input + 32, 8, in4, in5, in6, in7); + LD_SH4(input + 64, 8, in8, in9, in10, in11); + BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, + in8, in9, in10, in11); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp); + ST_SH(temp1, temp + 512); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 256); + ST_SH(temp1, temp + 768); + + SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 128); + ST_SH(temp1, temp + 896); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 640); + ST_SH(temp1, temp + 384); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 64); + ST_SH(temp1, temp + 960); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 576); + ST_SH(temp1, temp + 448); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 320); + ST_SH(temp1, temp + 704); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT32_POSTPROC_2V_POS_H(temp0, temp1); + ST_SH(temp0, temp + 192); + ST_SH(temp1, temp + 832); +} + +static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(input + 32); + in21 = LD_SH(input + 40); + in26 = LD_SH(input + 80); + in27 = LD_SH(input + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(input + 16); + in19 = LD_SH(input + 24); + in28 = LD_SH(input + 96); + in29 = LD_SH(input + 104); + + vec4 = in19 - in20; + ST_SH(vec4, input + 32); + vec4 = in18 - in21; + ST_SH(vec4, input + 40); + vec4 = in29 - in26; + ST_SH(vec4, input + 80); + vec4 = in28 - in27; + ST_SH(vec4, input + 88); + + in21 = in18 + in21; + in20 = in19 + in20; + in27 = in28 + in27; + in26 = in29 + in26; + + LD_SH4(input + 48, 8, in22, in23, in24, in25); + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(input); + in17 = LD_SH(input + 8); + in30 = LD_SH(input + 112); + in31 = LD_SH(input + 120); + + vec4 = in17 - in22; + ST_SH(vec4, input + 16); + vec4 = in16 - in23; + ST_SH(vec4, input + 24); + vec4 = in31 - in24; + ST_SH(vec4, input + 96); + vec4 = in30 - in25; + ST_SH(vec4, input + 104); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr); + ST_SH(vec4, temp_ptr + 960); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 448); + ST_SH(vec4, temp_ptr + 512); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 704); + ST_SH(vec5, temp_ptr + 256); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec4, temp_ptr + 192); + ST_SH(vec5, temp_ptr + 768); + + LD_SH4(input + 16, 8, in22, in23, in20, in21); + LD_SH4(input + 80, 8, in26, in27, in24, in25); + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 832); + ST_SH(vec4, temp_ptr + 128); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 320); + ST_SH(vec4, temp_ptr + 640); + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 576); + ST_SH(vec4, temp_ptr + 384); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT32_POSTPROC_2V_POS_H(vec5, vec4); + ST_SH(vec5, temp_ptr + 64); + ST_SH(vec4, temp_ptr + 896); +} + +static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, + int16_t *tmp_buf, int16_t *tmp_buf_big) { + fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); + fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); + fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); +} + +static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, + int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 step0, step1, step2, step3, step4, step5, step6, step7; + + LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); + + /* 2nd set */ + LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, + step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, + (output + 8 * 8), 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); +} + +static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, + int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; + v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); + + /* Stage 3 */ + UNPCK_SH_SW(vec0, vec0_l, vec0_r); + UNPCK_SH_SW(vec1, vec1_l, vec1_r); + UNPCK_SH_SW(vec2, vec2_l, vec2_r); + UNPCK_SH_SW(vec3, vec3_l, vec3_r); + UNPCK_SH_SW(vec4, vec4_l, vec4_r); + UNPCK_SH_SW(vec5, vec5_l, vec5_r); + UNPCK_SH_SW(vec6, vec6_l, vec6_r); + UNPCK_SH_SW(vec7, vec7_l, vec7_r); + ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, + tmp1_w, tmp2_w, tmp3_w); + BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); + ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, + vec1_r, vec2_r, vec3_r); + + tmp3_w = vec0_r + vec3_r; + vec0_r = vec0_r - vec3_r; + vec3_r = vec1_r + vec2_r; + vec1_r = vec1_r - vec2_r; + + DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out, 8); + + DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, + vec4_r, tmp3_w, vec6_r, vec3_r); + FDCT32_POSTPROC_NEG_W(vec4_r); + FDCT32_POSTPROC_NEG_W(tmp3_w); + FDCT32_POSTPROC_NEG_W(vec6_r); + FDCT32_POSTPROC_NEG_W(vec3_r); + PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); + ST_SH2(vec5, vec4, out + 16, 8); + + LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 32); + ST_SH(in5, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 40); + ST_SH(in5, out + 48); + + LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 64); + ST_SH(in5, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 72); + ST_SH(in5, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 80); + ST_SH(in5, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); + FDCT_POSTPROC_2V_NEG_H(in4, in5); + ST_SH(in4, out + 96); + ST_SH(in5, out + 88); +} + +static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); + DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + FDCT_POSTPROC_2V_NEG_H(temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + ADD2(in28, in29, in31, in30, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + FDCT_POSTPROC_2V_NEG_H(vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; + + /* 1st set */ + in0 = LD_SH(temp); + in4 = LD_SH(temp + 32); + in2 = LD_SH(temp + 64); + in6 = LD_SH(temp + 96); + in1 = LD_SH(temp + 128); + in7 = LD_SH(temp + 152); + in3 = LD_SH(temp + 192); + in5 = LD_SH(temp + 216); + + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* 2nd set */ + in0_1 = LD_SH(temp + 16); + in1_1 = LD_SH(temp + 232); + in2_1 = LD_SH(temp + 80); + in3_1 = LD_SH(temp + 168); + in4_1 = LD_SH(temp + 48); + in5_1 = LD_SH(temp + 176); + in6_1 = LD_SH(temp + 112); + in7_1 = LD_SH(temp + 240); + + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + + /* 3rd set */ + in0 = LD_SH(temp + 8); + in1 = LD_SH(temp + 136); + in2 = LD_SH(temp + 72); + in3 = LD_SH(temp + 200); + in4 = LD_SH(temp + 40); + in5 = LD_SH(temp + 208); + in6 = LD_SH(temp + 104); + in7 = LD_SH(temp + 144); + + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, + 32); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); + + /* 4th set */ + in0_1 = LD_SH(temp + 24); + in1_1 = LD_SH(temp + 224); + in2_1 = LD_SH(temp + 88); + in3_1 = LD_SH(temp + 160); + in4_1 = LD_SH(temp + 56); + in5_1 = LD_SH(temp + 184); + in6_1 = LD_SH(temp + 120); + in7_1 = LD_SH(temp + 248); + + TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, + in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); + ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, + 32); +} + +static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { + fdct8x32_1d_row_load_butterfly(temp, temp_buf); + fdct8x32_1d_row_even(temp_buf, temp_buf); + fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); + fdct8x32_1d_row_transpose_store(temp_buf, output); +} + +static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); + fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void aom_fdct32x32_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, + tmp_buf_big + (8 * i)); + } + + /* row transform */ + fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); + + /* row transform */ + for (i = 1; i < 4; ++i) { + fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); + } +} + +static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; + + /* fdct32 even */ + /* stage 2 */ + LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); + + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, + vec7, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT_POSTPROC_2V_NEG_H(vec0, vec1); + FDCT_POSTPROC_2V_NEG_H(vec2, vec3); + FDCT_POSTPROC_2V_NEG_H(vec4, vec5); + FDCT_POSTPROC_2V_NEG_H(vec6, vec7); + FDCT_POSTPROC_2V_NEG_H(in8, in9); + FDCT_POSTPROC_2V_NEG_H(in10, in11); + FDCT_POSTPROC_2V_NEG_H(in12, in13); + FDCT_POSTPROC_2V_NEG_H(in14, in15); + + /* Stage 3 */ + ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); + + temp0 = in0 + in3; + in0 = in0 - in3; + in3 = in1 + in2; + in1 = in1 - in2; + + DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); + ST_SH(temp0, out); + ST_SH(temp1, out + 8); + + DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); + ST_SH(temp0, out + 16); + ST_SH(temp1, out + 24); + + SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); + DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); + ADD2(vec4, vec5, vec7, vec6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); + ST_SH(temp0, out + 32); + ST_SH(temp1, out + 56); + + SUB2(vec4, vec5, vec7, vec6, vec4, vec7); + DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); + ST_SH(temp0, out + 40); + ST_SH(temp1, out + 48); + + DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); + DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); + ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); + DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); + ADD2(in0, in1, in2, in3, vec0, vec7); + DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); + ST_SH(temp0, out + 64); + ST_SH(temp1, out + 120); + + SUB2(in0, in1, in2, in3, in0, in2); + DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); + ST_SH(temp0, out + 72); + ST_SH(temp1, out + 112); + + SUB2(in9, vec2, in14, vec5, vec2, vec5); + DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); + SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); + DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); + ST_SH(temp0, out + 80); + ST_SH(temp1, out + 104); + + ADD2(in3, in2, in0, in1, vec3, vec4); + DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); + ST_SH(temp0, out + 96); + ST_SH(temp1, out + 88); +} + +static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, + int16_t *out) { + v8i16 in16, in17, in18, in19, in20, in21, in22, in23; + v8i16 in24, in25, in26, in27, in28, in29, in30, in31; + v8i16 vec4, vec5; + + in20 = LD_SH(temp + 32); + in21 = LD_SH(temp + 40); + in26 = LD_SH(temp + 80); + in27 = LD_SH(temp + 88); + + DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); + DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); + + FDCT_POSTPROC_2V_NEG_H(in20, in21); + FDCT_POSTPROC_2V_NEG_H(in26, in27); + + in18 = LD_SH(temp + 16); + in19 = LD_SH(temp + 24); + in28 = LD_SH(temp + 96); + in29 = LD_SH(temp + 104); + + FDCT_POSTPROC_2V_NEG_H(in18, in19); + FDCT_POSTPROC_2V_NEG_H(in28, in29); + + vec4 = in19 - in20; + ST_SH(vec4, interm_ptr + 32); + vec4 = in18 - in21; + ST_SH(vec4, interm_ptr + 88); + vec4 = in29 - in26; + ST_SH(vec4, interm_ptr + 64); + vec4 = in28 - in27; + ST_SH(vec4, interm_ptr + 56); + + ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); + + in22 = LD_SH(temp + 48); + in23 = LD_SH(temp + 56); + in24 = LD_SH(temp + 64); + in25 = LD_SH(temp + 72); + + DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); + DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); + FDCT_POSTPROC_2V_NEG_H(in22, in23); + FDCT_POSTPROC_2V_NEG_H(in24, in25); + + in16 = LD_SH(temp); + in17 = LD_SH(temp + 8); + in30 = LD_SH(temp + 112); + in31 = LD_SH(temp + 120); + + FDCT_POSTPROC_2V_NEG_H(in16, in17); + FDCT_POSTPROC_2V_NEG_H(in30, in31); + + vec4 = in17 - in22; + ST_SH(vec4, interm_ptr + 40); + vec4 = in30 - in25; + ST_SH(vec4, interm_ptr + 48); + vec4 = in31 - in24; + ST_SH(vec4, interm_ptr + 72); + vec4 = in16 - in23; + ST_SH(vec4, interm_ptr + 80); + + ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); + DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); + DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); + ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); + DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); + ADD2(in27, in26, in25, in24, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); + ST_SH(vec5, out); + ST_SH(vec4, out + 120); + + SUB2(in27, in26, in25, in24, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); + ST_SH(vec5, out + 112); + ST_SH(vec4, out + 8); + + SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); + DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); + SUB2(in26, in27, in24, in25, in23, in20); + DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); + ST_SH(vec4, out + 16); + ST_SH(vec5, out + 104); + + ADD2(in26, in27, in24, in25, in22, in21); + DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); + ST_SH(vec4, out + 24); + ST_SH(vec5, out + 96); + + in20 = LD_SH(interm_ptr + 32); + in21 = LD_SH(interm_ptr + 88); + in27 = LD_SH(interm_ptr + 56); + in26 = LD_SH(interm_ptr + 64); + + in16 = in20; + in17 = in21; + DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); + DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); + + in22 = LD_SH(interm_ptr + 40); + in25 = LD_SH(interm_ptr + 48); + in24 = LD_SH(interm_ptr + 72); + in23 = LD_SH(interm_ptr + 80); + + SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); + DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); + in16 = in28 + in29; + in19 = in31 + in30; + DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); + ST_SH(vec5, out + 32); + ST_SH(vec4, out + 88); + + SUB2(in28, in29, in31, in30, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); + ST_SH(vec5, out + 40); + ST_SH(vec4, out + 80); + + ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); + DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); + SUB2(in29, in28, in30, in31, in16, in19); + DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); + ST_SH(vec5, out + 72); + ST_SH(vec4, out + 48); + + ADD2(in29, in28, in30, in31, in17, in18); + DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); + ST_SH(vec4, out + 56); + ST_SH(vec5, out + 64); +} + +static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, + int16_t *output) { + fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); + fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); + fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); + fdct8x32_1d_row_transpose_store(tmp_buf, output); +} + +void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); + DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); + + /* column transform */ + for (i = 0; i < 4; ++i) { + fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], + &tmp_buf_big[0] + (8 * i)); + } + + /* row transform */ + for (i = 0; i < 4; ++i) { + fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], + out + (8 * i * 32)); + } +} + +void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum = LD_HADD(input, stride); + sum += LD_HADD(input + 8, stride); + sum += LD_HADD(input + 16, stride); + sum += LD_HADD(input + 24, stride); + sum += LD_HADD(input + 32 * 8, stride); + sum += LD_HADD(input + 32 * 8 + 8, stride); + sum += LD_HADD(input + 32 * 8 + 16, stride); + sum += LD_HADD(input + 32 * 8 + 24, stride); + sum += LD_HADD(input + 32 * 16, stride); + sum += LD_HADD(input + 32 * 16 + 8, stride); + sum += LD_HADD(input + 32 * 16 + 16, stride); + sum += LD_HADD(input + 32 * 16 + 24, stride); + sum += LD_HADD(input + 32 * 24, stride); + sum += LD_HADD(input + 32 * 24 + 8, stride); + sum += LD_HADD(input + 32 * 24 + 16, stride); + sum += LD_HADD(input + 32 * 24 + 24, stride); + out[0] = (int16_t)(sum >> 3); +} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c new file mode 100644 index 000000000..f16d290c8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/fwd_txfm_msa.h" + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; + v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; + v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; + v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; + v8i16 coeff2 = { + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 + }; + + LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, + in10, in11, in12, in13, in14, in15); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + SLLI_4V(in8, in9, in10, in11, 2); + SLLI_4V(in12, in13, in14, in15, 2); + ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); + ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); + SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); + SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); + + tmp_ptr += 16; + + /* stp 1 */ + ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); + ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); + + cnst4 = __msa_splati_h(coeff, 0); + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); + + cnst5 = __msa_splati_h(coeff, 1); + cnst5 = __msa_ilvev_h(cnst5, cnst4); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); + stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); + stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); + + /* stp2 */ + BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); + BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); + ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); + ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); + SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); + + cnst0 = __msa_splati_h(coeff, 4); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); + + BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); + ILVRL_H2_SH(in15, in8, vec1, vec0); + SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr); + + cnst0 = __msa_splati_h(coeff2, 0); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 224); + + ILVRL_H2_SH(in14, in9, vec1, vec0); + SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 128); + + cnst1 = __msa_splati_h(coeff2, 2); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 96); + + SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + cnst1 = __msa_splati_h(coeff, 3); + cnst1 = __msa_ilvev_h(cnst0, cnst1); + stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); + + /* stp4 */ + ADD2(stp34, stp25, stp33, stp22, in13, in10); + + ILVRL_H2_SH(in13, in10, vec1, vec0); + SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 64); + + cnst0 = __msa_splati_h(coeff2, 1); + cnst0 = __msa_ilvev_h(cnst1, cnst0); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 160); + + SUB2(stp34, stp25, stp33, stp22, in12, in11); + ILVRL_H2_SH(in12, in11, vec1, vec0); + SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); + cnst1 = __msa_ilvev_h(cnst1, cnst0); + + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); + ST_SH(in8, tmp_ptr + 192); + + cnst1 = __msa_splati_h(coeff2, 3); + cnst0 = __msa_ilvev_h(cnst0, cnst1); + in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); + ST_SH(in8, tmp_ptr + 32); +} + +void fdct16x8_1d_row(int16_t *input, int16_t *output) { + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 in8, in9, in10, in11, in12, in13, in14, in15; + + LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); + ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); + ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); + SRA_4V(in0, in1, in2, in3, 2); + SRA_4V(in4, in5, in6, in7, 2); + SRA_4V(in8, in9, in10, in11, 2); + SRA_4V(in12, in13, in14, in15, 2); + BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, + in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, + tmp7, in8, in9, in10, in11, in12, in13, in14, in15); + ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); + FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, + tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); + FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, + tmp1, in1, tmp2, in2, tmp3, in3); + ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); + TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, + tmp5, in5, tmp6, in6, tmp7, in7); + ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); +} + +void aom_fdct4x4_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3; + + LD_SH4(input, src_stride, in0, in1, in2, in3); + + /* fdct4 pre-process */ + { + v8i16 vec, mask; + v16i8 zero = { 0 }; + v16i8 one = __msa_ldi_b(1); + + mask = (v8i16)__msa_sldi_b(zero, one, 15); + SLLI_4V(in0, in1, in2, in3, 4); + vec = __msa_ceqi_h(in0, 0); + vec = vec ^ 255; + vec = mask & vec; + in0 += vec; + } + + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); + SRA_4V(in0, in1, in2, in3, 2); + PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); + ST_SH2(in0, in2, output, 8); +} + +void aom_fdct8x8_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); + SLLI_4V(in0, in1, in2, in3, 2); + SLLI_4V(in4, in5, in6, in7, 2); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, + in5, in6, in7); + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); + ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); +} + +void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + out[0] = LD_HADD(input, stride); + out[1] = 0; +} + +void aom_fdct16x16_msa(const int16_t *input, int16_t *output, + int32_t src_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); + + /* column transform */ + for (i = 0; i < 2; ++i) { + fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); + } + + /* row transform */ + for (i = 0; i < 2; ++i) { + fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); + } +} + +void aom_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { + int sum = LD_HADD(input, stride); + sum += LD_HADD(input + 8, stride); + sum += LD_HADD(input + 16 * 8, stride); + sum += LD_HADD(input + 16 * 8 + 8, stride); + out[0] = (int16_t)(sum >> 1); +} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h new file mode 100644 index 000000000..ada25dffd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_ +#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_ + +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_dsp/txfm_common.h" + +#define LD_HADD(psrc, stride) \ + ({ \ + v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ + v4i32 vec_w_m; \ + \ + LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ + ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ + LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ + ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ + in0_m, in4_m); \ + in0_m += in4_m; \ + \ + vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ + HADD_SW_S32(vec_w_m); \ + }) + +#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ + v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ + v8i16 coeff_m = { \ + cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ + }; \ + \ + BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ + ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ + cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ + vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ + cnst2_m = __msa_splati_h(coeff_m, 2); \ + cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ + vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ + \ + SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ + vec7_m, out0, out2, out1, out3); \ + } + +#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ + { \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + \ + SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ + SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ + AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ + in2, in3); \ + AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ + in6, in7); \ + } + +#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ + v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v8i16 x0_m, x1_m, x2_m, x3_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ + \ + /* FDCT stage1 */ \ + BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ + s3_m, s4_m, s5_m, s6_m, s7_m); \ + BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ + ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ + ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ + SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x1_m, x0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ + x2_m = -x2_m; \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + x2_m = __msa_splati_h(coeff_m, 2); \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ + \ + /* stage2 */ \ + ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ + \ + s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ + s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ + \ + /* stage3 */ \ + BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ + \ + /* stage4 */ \ + ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ + ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ + \ + SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ + x1_m = __msa_ilvev_h(x0_m, x1_m); \ + out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ + \ + SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ + x2_m = __msa_ilvev_h(x3_m, x2_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + \ + x1_m = __msa_splati_h(coeff_m, 5); \ + x0_m = -x0_m; \ + x0_m = __msa_ilvev_h(x1_m, x0_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ + \ + x2_m = __msa_splati_h(coeff_m, 6); \ + x3_m = -x3_m; \ + x2_m = __msa_ilvev_h(x2_m, x3_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ + } + +#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ + input7, out1, out3, out5, out7, out9, out11, out13, \ + out15) \ + { \ + v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ + v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ + v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ + v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ + v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ + v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ + -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ + v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ + cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \ + v8i16 coeff2_m = { \ + -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ + }; \ + \ + /* stp 1 */ \ + ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ + ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ + \ + cnst4_m = __msa_splati_h(coeff_m, 0); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ + \ + cnst5_m = __msa_splati_h(coeff_m, 1); \ + cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ + stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ + stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ + \ + /* stp2 */ \ + BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ + stp33_m); \ + BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ + stp34_m); \ + \ + ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ + ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ + \ + SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 4); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + cnst0_m = __msa_splati_h(coeff_m, 3); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ + \ + /* stp4 */ \ + BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ + vec5_m); \ + BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ + stp31_m); \ + \ + ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + \ + out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 0); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 2); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + cnst0_m = __msa_splati_h(coeff2_m, 1); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + \ + ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ + SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + \ + cnst1_m = __msa_splati_h(coeff2_m, 3); \ + cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + } + +#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one_m = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clti_s_h(vec0, 0); \ + tp1_m = __msa_clti_s_h(vec1, 0); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one_m & tp0_m; \ + tp1_m = one_m & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define FDCT32_POSTPROC_NEG_W(vec) \ + { \ + v4i32 temp_m; \ + v4i32 one_m = __msa_ldi_w(1); \ + \ + temp_m = __msa_clti_s_w(vec, 0); \ + vec += 1; \ + temp_m = one_m & temp_m; \ + vec += temp_m; \ + vec >>= 2; \ + } + +#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ + { \ + v8i16 tp0_m, tp1_m; \ + v8i16 one = __msa_ldi_h(1); \ + \ + tp0_m = __msa_clei_s_h(vec0, 0); \ + tp1_m = __msa_clei_s_h(vec1, 0); \ + tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ + tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ + vec0 += 1; \ + vec1 += 1; \ + tp0_m = one & tp0_m; \ + tp1_m = one & tp1_m; \ + vec0 += tp0_m; \ + vec1 += tp1_m; \ + vec0 >>= 2; \ + vec1 >>= 2; \ + } + +#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ + const0, const1, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ + v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ + v4i32 k0_m = __msa_fill_w((int32_t)const0); \ + \ + s0_m = __msa_fill_w((int32_t)const1); \ + k0_m = __msa_ilvev_w(s0_m, k0_m); \ + \ + ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ + ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ + ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ + ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ + \ + DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + \ + DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ + DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ + tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ + tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ + tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ + tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ + out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ + out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ + } + +void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, + int32_t src_stride); +void fdct16x8_1d_row(int16_t *input, int16_t *output); +#endif // AOM_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c new file mode 100644 index 000000000..0ea127f52 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct16x16_msa.c @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8; + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, + reg2, reg3, reg4, reg5, reg6, reg7); + TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, + reg9, reg10, reg11, reg12, reg13, reg14, reg15); + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4, + reg8); + ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6, + reg10); + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, + reg2, reg4, reg6, reg8, reg10, reg12, reg14); + ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); + + /* transpose block */ + TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, + reg13, reg11, reg5, reg7, reg9, reg1, reg15); + ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); +} + +void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 loc0, loc1, loc2, loc3; + v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; + v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; + v8i16 tmp5, tmp6, tmp7; + + /* load up 8x8 */ + LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + input += 8 * 16; + /* load bottom 8x8 */ + LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); + + DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); + DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); + BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); + DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); + DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); + DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); + BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); + + reg0 = reg2 - loc1; + reg2 = reg2 + loc1; + reg12 = reg14 - loc0; + reg14 = reg14 + loc0; + reg4 = reg6 - loc3; + reg6 = reg6 + loc3; + reg8 = reg10 - loc2; + reg10 = reg10 + loc2; + + /* stage 2 */ + DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); + DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); + + reg9 = reg1 - loc2; + reg1 = reg1 + loc2; + reg7 = reg15 - loc3; + reg15 = reg15 + loc3; + + DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); + DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); + BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); + + loc1 = reg15 + reg3; + reg3 = reg15 - reg3; + loc2 = reg2 + loc1; + reg15 = reg2 - loc1; + + loc1 = reg1 + reg13; + reg13 = reg1 - reg13; + loc0 = reg0 + loc1; + loc1 = reg0 - loc1; + tmp6 = loc0; + tmp7 = loc1; + reg0 = loc2; + + DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); + DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); + + loc0 = reg9 + reg5; + reg5 = reg9 - reg5; + reg2 = reg6 + loc0; + reg1 = reg6 - loc0; + + loc0 = reg7 + reg11; + reg11 = reg7 - reg11; + loc1 = reg4 + loc0; + loc2 = reg4 - loc0; + tmp5 = loc1; + + DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); + BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); + + reg10 = loc0; + reg11 = loc1; + + DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); + BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); + reg13 = loc2; + + /* Transpose and store the output */ + reg12 = tmp5; + reg14 = tmp6; + reg3 = tmp7; + + SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); + dst += (4 * dst_stride); + SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); + dst += (4 * dst_stride); + SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); + dst += (4 * dst_stride); + SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); +} + +void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* transform rows */ + for (i = 0; i < 2; ++i) { + /* process 16 * 8 block */ + aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); + } + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); + int16_t *out = out_arr; + + /* process 16 * 8 block */ + aom_idct16_1d_rows_msa(input, out); + + /* short case just considers top 4 rows as valid output */ + out += 4 * 16; + for (i = 12; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out]) \n\t" + "sw $zero, 4(%[out]) \n\t" + "sw $zero, 8(%[out]) \n\t" + "sw $zero, 12(%[out]) \n\t" + "sw $zero, 16(%[out]) \n\t" + "sw $zero, 20(%[out]) \n\t" + "sw $zero, 24(%[out]) \n\t" + "sw $zero, 28(%[out]) \n\t" + + : + : [out] "r"(out)); + + out += 16; + } + + out = out_arr; + + /* transform columns */ + for (i = 0; i < 2; ++i) { + /* process 8 * 16 block */ + aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + uint8_t i; + int16_t out; + v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 4; i--;) { + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; + + /* load input data */ + LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, + l7, l15); + TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, + l7); + TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, + l12, l13, l14, l15); + + /* ADST in horizontal */ + AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, + l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, + r12, r13, r14, r15); + + l1 = -r8; + l3 = -r4; + l13 = -r13; + l15 = -r1; + + TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, + l6, l7); + ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); + TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, + l13, l14, l15); + ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); +} + +void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 v0, v2, v4, v6, k0, k1, k2, k3; + v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + v8i16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 out8, out9, out10, out11, out12, out13, out14, out15; + v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; + v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v8i16 res8, res9, res10, res11, res12, res13, res14, res15; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; + v16i8 zero = { 0 }; + + r0 = LD_SH(input + 0 * 16); + r3 = LD_SH(input + 3 * 16); + r4 = LD_SH(input + 4 * 16); + r7 = LD_SH(input + 7 * 16); + r8 = LD_SH(input + 8 * 16); + r11 = LD_SH(input + 11 * 16); + r12 = LD_SH(input + 12 * 16); + r15 = LD_SH(input + 15 * 16); + + /* stage 1 */ + k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); + k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); + k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); + k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); + MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); + k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); + k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); + k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); + k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); + MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); + BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); + k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); + MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); + + r1 = LD_SH(input + 1 * 16); + r2 = LD_SH(input + 2 * 16); + r5 = LD_SH(input + 5 * 16); + r6 = LD_SH(input + 6 * 16); + r9 = LD_SH(input + 9 * 16); + r10 = LD_SH(input + 10 * 16); + r13 = LD_SH(input + 13 * 16); + r14 = LD_SH(input + 14 * 16); + + k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); + k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); + k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); + k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); + MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); + k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); + k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); + k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); + k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); + MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); + BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); + BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); + out1 = -out1; + SRARI_H2_SH(out0, out1, 6); + dst0 = LD_UB(dst + 0 * dst_stride); + dst1 = LD_UB(dst + 15 * dst_stride); + ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); + ADD2(res0, out0, res1, out1, res0, res1); + CLIP_SH2_0_255(res0, res1); + PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); + ST8x1_UB(res0, dst); + ST8x1_UB(res1, dst + 15 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); + MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); + BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); + out8 = -out8; + + SRARI_H2_SH(out8, out9, 6); + dst8 = LD_UB(dst + 1 * dst_stride); + dst9 = LD_UB(dst + 14 * dst_stride); + ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); + ADD2(res8, out8, res9, out9, res8, res9); + CLIP_SH2_0_255(res8, res9); + PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); + ST8x1_UB(res8, dst + dst_stride); + ST8x1_UB(res9, dst + 14 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); + MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); + out4 = -out4; + SRARI_H2_SH(out4, out5, 6); + dst4 = LD_UB(dst + 3 * dst_stride); + dst5 = LD_UB(dst + 12 * dst_stride); + ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); + ADD2(res4, out4, res5, out5, res4, res5); + CLIP_SH2_0_255(res4, res5); + PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); + ST8x1_UB(res4, dst + 3 * dst_stride); + ST8x1_UB(res5, dst + 12 * dst_stride); + + MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); + out13 = -out13; + SRARI_H2_SH(out12, out13, 6); + dst12 = LD_UB(dst + 2 * dst_stride); + dst13 = LD_UB(dst + 13 * dst_stride); + ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); + ADD2(res12, out12, res13, out13, res12, res13); + CLIP_SH2_0_255(res12, res13); + PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); + ST8x1_UB(res12, dst + 2 * dst_stride); + ST8x1_UB(res13, dst + 13 * dst_stride); + + k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + MADD_SHORT(out6, out7, k0, k3, out6, out7); + SRARI_H2_SH(out6, out7, 6); + dst6 = LD_UB(dst + 4 * dst_stride); + dst7 = LD_UB(dst + 11 * dst_stride); + ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); + ADD2(res6, out6, res7, out7, res6, res7); + CLIP_SH2_0_255(res6, res7); + PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); + ST8x1_UB(res6, dst + 4 * dst_stride); + ST8x1_UB(res7, dst + 11 * dst_stride); + + MADD_SHORT(out10, out11, k0, k3, out10, out11); + SRARI_H2_SH(out10, out11, 6); + dst10 = LD_UB(dst + 6 * dst_stride); + dst11 = LD_UB(dst + 9 * dst_stride); + ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); + ADD2(res10, out10, res11, out11, res10, res11); + CLIP_SH2_0_255(res10, res11); + PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); + ST8x1_UB(res10, dst + 6 * dst_stride); + ST8x1_UB(res11, dst + 9 * dst_stride); + + k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); + k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + MADD_SHORT(h10, h11, k1, k2, out2, out3); + SRARI_H2_SH(out2, out3, 6); + dst2 = LD_UB(dst + 7 * dst_stride); + dst3 = LD_UB(dst + 8 * dst_stride); + ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); + ADD2(res2, out2, res3, out3, res2, res3); + CLIP_SH2_0_255(res2, res3); + PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); + ST8x1_UB(res2, dst + 7 * dst_stride); + ST8x1_UB(res3, dst + 8 * dst_stride); + + MADD_SHORT(out14, out15, k1, k2, out14, out15); + SRARI_H2_SH(out14, out15, 6); + dst14 = LD_UB(dst + 5 * dst_stride); + dst15 = LD_UB(dst + 10 * dst_stride); + ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); + ADD2(res14, out14, res15, out15, res14, res15); + CLIP_SH2_0_255(res14, res15); + PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); + ST8x1_UB(res14, dst + 5 * dst_stride); + ST8x1_UB(res15, dst + 10 * dst_stride); +} diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c new file mode 100644 index 000000000..f1ca757a0 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct32x32_msa.c @@ -0,0 +1,730 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +static void idct32x8_row_transpose_store(const int16_t *input, + int16_t *tmp_buf) { + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* 1st & 2nd 8x8 */ + LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); + + /* 3rd & 4th 8x8 */ + LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); + ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); + ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); + ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); +} + +static void idct32x8_row_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 15 * 8)); + ST_SH(loc1, (tmp_eve_buf)); + ST_SH(loc2, (tmp_eve_buf + 14 * 8)); + ST_SH(loc3, (tmp_eve_buf + 8)); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 13 * 8)); + ST_SH(loc1, (tmp_eve_buf + 2 * 8)); + ST_SH(loc2, (tmp_eve_buf + 12 * 8)); + ST_SH(loc3, (tmp_eve_buf + 3 * 8)); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 11 * 8)); + ST_SH(loc1, (tmp_eve_buf + 4 * 8)); + ST_SH(loc2, (tmp_eve_buf + 10 * 8)); + ST_SH(loc3, (tmp_eve_buf + 5 * 8)); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH(loc0, (tmp_eve_buf + 9 * 8)); + ST_SH(loc1, (tmp_eve_buf + 6 * 8)); + ST_SH(loc2, (tmp_eve_buf + 8 * 8)); + ST_SH(loc3, (tmp_eve_buf + 7 * 8)); +} + +static void idct32x8_row_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 8); + reg1 = LD_SH(tmp_buf + 7 * 8); + reg2 = LD_SH(tmp_buf + 9 * 8); + reg3 = LD_SH(tmp_buf + 15 * 8); + reg4 = LD_SH(tmp_buf + 17 * 8); + reg5 = LD_SH(tmp_buf + 23 * 8); + reg6 = LD_SH(tmp_buf + 25 * 8); + reg7 = LD_SH(tmp_buf + 31 * 8); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf), 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 8); + reg1 = LD_SH(tmp_buf + 5 * 8); + reg2 = LD_SH(tmp_buf + 11 * 8); + reg3 = LD_SH(tmp_buf + 13 * 8); + reg4 = LD_SH(tmp_buf + 19 * 8); + reg5 = LD_SH(tmp_buf + 21 * 8); + reg6 = LD_SH(tmp_buf + 27 * 8); + reg7 = LD_SH(tmp_buf + 29 * 8); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + + BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH(reg0, (tmp_odd_buf + 13 * 8)); + ST_SH(reg1, (tmp_odd_buf + 14 * 8)); + + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct_butterfly_transpose_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, int16_t *dst) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + + ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + + ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + + ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + + ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); + ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); + ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); + ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); + + /* Transpose : 16 vectors */ + /* 1st & 2nd 8x8 */ + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 0), 32); + ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 8), 32); + ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); + + /* 3rd & 4th 8x8 */ + LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); + LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); + TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, + n3); + ST_SH4(m0, n0, m1, n1, (dst + 16), 32); + ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); + + TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, + n7); + ST_SH4(m4, n4, m5, n5, (dst + 24), 32); + ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); +} + +static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { + DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct32x8_row_transpose_store(input, &tmp_buf[0]); + idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); + idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); + idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], + output); +} + +static void idct8x32_column_even_process_store(int16_t *tmp_buf, + int16_t *tmp_eve_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; + + /* Even stage 1 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + tmp_buf += (2 * 32); + + DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); + DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); + BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); + DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + + loc1 = vec3; + loc0 = vec1; + + DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); + DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); + BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); + BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); + BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); + + /* Even stage 2 */ + /* Load 8 */ + LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); + + DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); + DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); + + vec0 = reg0 + reg4; + reg0 = reg0 - reg4; + reg4 = reg6 + reg2; + reg6 = reg6 - reg2; + reg2 = reg1 + reg5; + reg1 = reg1 - reg5; + reg5 = reg7 + reg3; + reg7 = reg7 - reg3; + reg3 = vec0; + + vec1 = reg2; + reg2 = reg3 + reg4; + reg3 = reg3 - reg4; + reg4 = reg5 - vec1; + reg5 = reg5 + vec1; + + DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); + DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); + + vec0 = reg0 - reg6; + reg0 = reg0 + reg6; + vec1 = reg7 - reg1; + reg7 = reg7 + reg1; + + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); + DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); + + /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ + /* Store 8 */ + BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, tmp_eve_buf, 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); + + BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); + + /* Store 8 */ + BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); + + BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); + ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); + ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); +} + +static void idct8x32_column_odd_process_store(int16_t *tmp_buf, + int16_t *tmp_odd_buf) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + + /* Odd stage 1 */ + reg0 = LD_SH(tmp_buf + 32); + reg1 = LD_SH(tmp_buf + 7 * 32); + reg2 = LD_SH(tmp_buf + 9 * 32); + reg3 = LD_SH(tmp_buf + 15 * 32); + reg4 = LD_SH(tmp_buf + 17 * 32); + reg5 = LD_SH(tmp_buf + 23 * 32); + reg6 = LD_SH(tmp_buf + 25 * 32); + reg7 = LD_SH(tmp_buf + 31 * 32); + + DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); + DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); + DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); + DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); + + vec0 = reg0 + reg3; + reg0 = reg0 - reg3; + reg3 = reg7 + reg4; + reg7 = reg7 - reg4; + reg4 = reg1 + reg2; + reg1 = reg1 - reg2; + reg2 = reg6 + reg5; + reg6 = reg6 - reg5; + reg5 = vec0; + + /* 4 Stores */ + ADD2(reg5, reg4, reg3, reg2, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); + SUB2(reg5, reg4, reg3, reg2, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); + ST_SH2(vec0, vec1, tmp_odd_buf, 8); + + /* 4 Stores */ + DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); + DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); + BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); + ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); + DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); + ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); + + /* Odd stage 2 */ + /* 8 loads */ + reg0 = LD_SH(tmp_buf + 3 * 32); + reg1 = LD_SH(tmp_buf + 5 * 32); + reg2 = LD_SH(tmp_buf + 11 * 32); + reg3 = LD_SH(tmp_buf + 13 * 32); + reg4 = LD_SH(tmp_buf + 19 * 32); + reg5 = LD_SH(tmp_buf + 21 * 32); + reg6 = LD_SH(tmp_buf + 27 * 32); + reg7 = LD_SH(tmp_buf + 29 * 32); + + DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); + DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); + DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); + DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); + + /* 4 Stores */ + SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); + DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); + DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); + BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); + ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); + DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); + ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); + + /* 4 Stores */ + ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); + BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); + ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); + DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); + ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); + + /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ + /* Load 8 & Store 8 */ + LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); + LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); + + SUB2(reg0, reg4, reg1, reg5, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg2, reg6, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); + + /* Load 8 & Store 8 */ + LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); + LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); + + ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); + + SUB2(reg0, reg4, reg3, reg7, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); + + SUB2(reg1, reg5, reg2, reg6, vec0, vec1); + DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); + ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); +} + +static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, + int16_t *tmp_odd_buf, uint8_t *dst, + int32_t dst_stride) { + v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; + v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; + + /* FINAL BUTTERFLY : Dependency on Even & Odd */ + vec0 = LD_SH(tmp_odd_buf); + vec1 = LD_SH(tmp_odd_buf + 9 * 8); + vec2 = LD_SH(tmp_odd_buf + 14 * 8); + vec3 = LD_SH(tmp_odd_buf + 6 * 8); + loc0 = LD_SH(tmp_eve_buf); + loc1 = LD_SH(tmp_eve_buf + 8 * 8); + loc2 = LD_SH(tmp_eve_buf + 4 * 8); + loc3 = LD_SH(tmp_eve_buf + 12 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); + SRARI_H4_SH(m0, m2, m4, m6, 6); + AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); + SRARI_H4_SH(m0, m2, m4, m6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, + m6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 4 * 8); + vec1 = LD_SH(tmp_odd_buf + 13 * 8); + vec2 = LD_SH(tmp_odd_buf + 10 * 8); + vec3 = LD_SH(tmp_odd_buf + 3 * 8); + loc0 = LD_SH(tmp_eve_buf + 2 * 8); + loc1 = LD_SH(tmp_eve_buf + 10 * 8); + loc2 = LD_SH(tmp_eve_buf + 6 * 8); + loc3 = LD_SH(tmp_eve_buf + 14 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); + SRARI_H4_SH(m1, m3, m5, m7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); + SRARI_H4_SH(m1, m3, m5, m7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, + m7); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 2 * 8); + vec1 = LD_SH(tmp_odd_buf + 11 * 8); + vec2 = LD_SH(tmp_odd_buf + 12 * 8); + vec3 = LD_SH(tmp_odd_buf + 7 * 8); + loc0 = LD_SH(tmp_eve_buf + 1 * 8); + loc1 = LD_SH(tmp_eve_buf + 9 * 8); + loc2 = LD_SH(tmp_eve_buf + 5 * 8); + loc3 = LD_SH(tmp_eve_buf + 13 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); + SRARI_H4_SH(n0, n2, n4, n6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); + SRARI_H4_SH(n0, n2, n4, n6, 6); + AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, + n6); + + /* Load 8 & Store 8 */ + vec0 = LD_SH(tmp_odd_buf + 5 * 8); + vec1 = LD_SH(tmp_odd_buf + 15 * 8); + vec2 = LD_SH(tmp_odd_buf + 8 * 8); + vec3 = LD_SH(tmp_odd_buf + 1 * 8); + loc0 = LD_SH(tmp_eve_buf + 3 * 8); + loc1 = LD_SH(tmp_eve_buf + 11 * 8); + loc2 = LD_SH(tmp_eve_buf + 7 * 8); + loc3 = LD_SH(tmp_eve_buf + 15 * 8); + + ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); + SRARI_H4_SH(n1, n3, n5, n7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); + + SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); + SRARI_H4_SH(n1, n3, n5, n7, 6); + AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, + n7); +} + +static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride) { + DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); + DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); + + idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); + idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); + idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, + dst_stride); +} + +void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + /* transform rows */ + for (i = 0; i < 4; ++i) { + /* process 32 * 8 block */ + idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); + } + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); + int16_t *out_ptr = out_arr; + + for (i = 32; i--;) { + __asm__ __volatile__( + "sw $zero, 0(%[out_ptr]) \n\t" + "sw $zero, 4(%[out_ptr]) \n\t" + "sw $zero, 8(%[out_ptr]) \n\t" + "sw $zero, 12(%[out_ptr]) \n\t" + "sw $zero, 16(%[out_ptr]) \n\t" + "sw $zero, 20(%[out_ptr]) \n\t" + "sw $zero, 24(%[out_ptr]) \n\t" + "sw $zero, 28(%[out_ptr]) \n\t" + "sw $zero, 32(%[out_ptr]) \n\t" + "sw $zero, 36(%[out_ptr]) \n\t" + "sw $zero, 40(%[out_ptr]) \n\t" + "sw $zero, 44(%[out_ptr]) \n\t" + "sw $zero, 48(%[out_ptr]) \n\t" + "sw $zero, 52(%[out_ptr]) \n\t" + "sw $zero, 56(%[out_ptr]) \n\t" + "sw $zero, 60(%[out_ptr]) \n\t" + + : + : [out_ptr] "r"(out_ptr)); + + out_ptr += 32; + } + + out_ptr = out_arr; + + /* rows: only upper-left 8x8 has non-zero coeff */ + idct32x8_1d_rows_msa(input, out_ptr); + + /* transform columns */ + for (i = 0; i < 4; ++i) { + /* process 8 * 32 block */ + idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), + dst_stride); + } +} + +void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int32_t i; + int16_t out; + v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 6); + + vec = __msa_fill_h(out); + + for (i = 16; i--;) { + LD_UB2(dst, 16, dst0, dst1); + LD_UB2(dst + dst_stride, 16, dst2, dst3); + + UNPCK_UB_SH(dst0, res0, res4); + UNPCK_UB_SH(dst1, res1, res5); + UNPCK_UB_SH(dst2, res2, res6); + UNPCK_UB_SH(dst3, res3, res7); + ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); + ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); + CLIP_SH4_0_255(res0, res1, res2, res3); + CLIP_SH4_0_255(res4, res5, res6, res7); + PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, + tmp2, tmp3); + + ST_UB2(tmp0, tmp1, dst, 16); + dst += dst_stride; + ST_UB2(tmp2, tmp3, dst, 16); + dst += dst_stride; + } +} diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c new file mode 100644 index 000000000..274818baa --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct4x4_msa.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in2, in3, in1); + TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); + UNPCK_R_SH_SW(in0, in0_r); + UNPCK_R_SH_SW(in2, in2_r); + UNPCK_R_SH_SW(in3, in3_r); + UNPCK_R_SH_SW(in1, in1_r); + SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); + + in0_r += in2_r; + in3_r -= in1_r; + in4_r = (in0_r - in3_r) >> 1; + in1_r = in4_r - in1_r; + in2_r = in4_r - in2_r; + in0_r -= in1_r; + in3_r += in2_r; + + TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); + + in0_r += in1_r; + in2_r -= in3_r; + in4_r = (in0_r - in2_r) >> 1; + in3_r = in4_r - in3_r; + in1_r = in4_r - in1_r; + in0_r -= in3_r; + in2_r += in1_r; + + PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, + in2, in3); + ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); +} + +void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t a1, e1; + v8i16 in1, in0 = { 0 }; + + a1 = input[0] >> UNIT_QUANT_SHIFT; + e1 = a1 >> 1; + a1 -= e1; + + in0 = __msa_insert_h(in0, 0, a1); + in0 = __msa_insert_h(in0, 1, e1); + in0 = __msa_insert_h(in0, 2, e1); + in0 = __msa_insert_h(in0, 3, e1); + + in1 = in0 >> 1; + in0 -= in1; + + ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride); +} + +void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3; + + /* load vector elements of 4x4 block */ + LD4x4_SH(input, in0, in1, in2, in3); + /* rows */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* columns */ + TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); + /* rounding (add 2^3, divide by 2^4) */ + SRARI_H4_SH(in0, in1, in2, in3, 4); + ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); +} + +void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO(out, 4); + vec = __msa_fill_h(out); + + ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); +} diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c new file mode 100644 index 000000000..981c103cd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/idct8x8_msa.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/inv_txfm_msa.h" + +void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + + /* rows transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* columns transform */ + TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* 1D idct8x8 */ + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + /* add block and store 8x8 */ + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; + v4i32 tmp0, tmp1, tmp2, tmp3; + v8i16 zero = { 0 }; + + /* load vector elements of 8x8 block */ + LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); + TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); + + /* stage1 */ + ILVL_H2_SH(in3, in0, in2, in1, s0, s1); + k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); + k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); + k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); + k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); + + /* stage2 */ + ILVR_H2_SH(in3, in1, in2, in0, s1, s0); + k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); + k1 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); + k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); + k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); + DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); + SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); + PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); + BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); + + /* stage3 */ + s0 = __msa_ilvr_h(s6, s5); + + k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); + DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); + SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS); + PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); + + /* stage4 */ + BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, + in7); + TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + /* final rounding (add 2^4, divide by 2^5) and shift */ + SRARI_H4_SH(in0, in1, in2, in3, 5); + SRARI_H4_SH(in4, in5, in6, in7, 5); + + /* add block and store 8x8 */ + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); +} + +void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst, + int32_t dst_stride) { + int16_t out; + int32_t val; + v8i16 vec; + + out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); + out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); + val = ROUND_POWER_OF_TWO(out, 5); + vec = __msa_fill_h(val); + + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); + dst += (4 * dst_stride); + AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); +} diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c new file mode 100644 index 000000000..dc8f20208 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + "lb %[tmp9], 8(%[left]) \n\t" + "lb %[tmp10], 9(%[left]) \n\t" + "lb %[tmp11], 10(%[left]) \n\t" + "lb %[tmp12], 11(%[left]) \n\t" + "lb %[tmp13], 12(%[left]) \n\t" + "lb %[tmp14], 13(%[left]) \n\t" + "lb %[tmp15], 14(%[left]) \n\t" + "lb %[tmp16], 15(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + "replv.qb %[tmp9], %[tmp9] \n\t" + "replv.qb %[tmp10], %[tmp10] \n\t" + "replv.qb %[tmp11], %[tmp11] \n\t" + "replv.qb %[tmp12], %[tmp12] \n\t" + "replv.qb %[tmp13], %[tmp13] \n\t" + "replv.qb %[tmp14], %[tmp14] \n\t" + "replv.qb %[tmp15], %[tmp15] \n\t" + "replv.qb %[tmp16], %[tmp16] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "sw %[tmp1], 8(%[dst]) \n\t" + "sw %[tmp1], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "sw %[tmp2], 8(%[dst]) \n\t" + "sw %[tmp2], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "sw %[tmp3], 8(%[dst]) \n\t" + "sw %[tmp3], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "sw %[tmp4], 8(%[dst]) \n\t" + "sw %[tmp4], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "sw %[tmp5], 8(%[dst]) \n\t" + "sw %[tmp5], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "sw %[tmp6], 8(%[dst]) \n\t" + "sw %[tmp6], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "sw %[tmp7], 8(%[dst]) \n\t" + "sw %[tmp7], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + "sw %[tmp8], 8(%[dst]) \n\t" + "sw %[tmp8], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp9], (%[dst]) \n\t" + "sw %[tmp9], 4(%[dst]) \n\t" + "sw %[tmp9], 8(%[dst]) \n\t" + "sw %[tmp9], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp10], (%[dst]) \n\t" + "sw %[tmp10], 4(%[dst]) \n\t" + "sw %[tmp10], 8(%[dst]) \n\t" + "sw %[tmp10], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp11], (%[dst]) \n\t" + "sw %[tmp11], 4(%[dst]) \n\t" + "sw %[tmp11], 8(%[dst]) \n\t" + "sw %[tmp11], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp12], (%[dst]) \n\t" + "sw %[tmp12], 4(%[dst]) \n\t" + "sw %[tmp12], 8(%[dst]) \n\t" + "sw %[tmp12], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp13], (%[dst]) \n\t" + "sw %[tmp13], 4(%[dst]) \n\t" + "sw %[tmp13], 8(%[dst]) \n\t" + "sw %[tmp13], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp14], (%[dst]) \n\t" + "sw %[tmp14], 4(%[dst]) \n\t" + "sw %[tmp14], 8(%[dst]) \n\t" + "sw %[tmp14], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp15], (%[dst]) \n\t" + "sw %[tmp15], 4(%[dst]) \n\t" + "sw %[tmp15], 8(%[dst]) \n\t" + "sw %[tmp15], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp16], (%[dst]) \n\t" + "sw %[tmp16], 4(%[dst]) \n\t" + "sw %[tmp16], 8(%[dst]) \n\t" + "sw %[tmp16], 12(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9), + [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12), + [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15), + [tmp16] "=&r"(tmp16) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, left2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "lw %[above1], 8(%[above]) \n\t" + "lw %[above2], 12(%[above]) \n\t" + "lw %[left1], 8(%[left]) \n\t" + "lw %[left2], 12(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "preceu.ph.qbl %[above_l1], %[above2] \n\t" + "preceu.ph.qbr %[above_r1], %[above2] \n\t" + "preceu.ph.qbl %[left_l1], %[left2] \n\t" + "preceu.ph.qbr %[left_r1], %[left2] \n\t" + + "addu.ph %[average], %[average], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[above_r1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addiu %[average], %[average], 16 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 5 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + "sw %[expected_dc], 8(%[dst]) \n\t" + "sw %[expected_dc], 12(%[dst]) \n\t" + + : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1), + [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1), + [above_r1] "=&r"(above_r1), [above2] "=&r"(above2), + [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c new file mode 100644 index 000000000..ea7c02810 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "sw %[tmp1], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; + + __asm__ __volatile__( + "lw %[above_c], (%[above]) \n\t" + "lw %[left_c], (%[left]) \n\t" + + "preceu.ph.qbl %[above_l], %[above_c] \n\t" + "preceu.ph.qbr %[above_r], %[above_c] \n\t" + "preceu.ph.qbl %[left_l], %[left_c] \n\t" + "preceu.ph.qbr %[left_r], %[left_c] \n\t" + + "addu.ph %[average], %[above_r], %[above_l] \n\t" + "addu.ph %[average], %[average], %[left_l] \n\t" + "addu.ph %[average], %[average], %[left_r] \n\t" + "addiu %[average], %[average], 4 \n\t" + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 3 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + + : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l), + [above_r] "=&r"(above_r), [left_c] "=&r"(left_c), + [left_l] "=&r"(left_l), [left_r] "=&r"(left_r), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} + +void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t left0, left1, left2, left3; + int32_t res0, res1; + int32_t resl; + int32_t resr; + int32_t top_left; + uint8_t *cm = aom_ff_cropTbl; + + __asm__ __volatile__( + "ulw %[resl], (%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + "lbu %[left1], 1(%[left]) \n\t" + "lbu %[left2], 2(%[left]) \n\t" + "lbu %[left3], 3(%[left]) \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + + "preceu.ph.qbl %[abovel], %[resl] \n\t" + "preceu.ph.qbr %[abover], %[resl] \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "replv.ph %[left1], %[left1] \n\t" + "replv.ph %[left2], %[left2] \n\t" + "replv.ph %[left3], %[left3] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[resl], %[abovel], %[left0] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left0] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left1] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left1] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left2] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left2] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sb %[res1], 1(%[dst]) \n\t" + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "addu.ph %[resl], %[abovel], %[left3] \n\t" + "subu.ph %[resl], %[resl], %[top_left] \n\t" + + "addu.ph %[resr], %[abover], %[left3] \n\t" + "subu.ph %[resr], %[resr], %[top_left] \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + + "sll %[res0], %[resr], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + + "sra %[res1], %[resr], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "sb %[res0], (%[dst]) \n\t" + + "sll %[res0], %[resl], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "lbux %[res0], %[res0](%[cm]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + + "sra %[res1], %[resl], 16 \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + + "sb %[res0], 2(%[dst]) \n\t" + "sb %[res1], 3(%[dst]) \n\t" + + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0), + [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0), + [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl), + [resr] "=&r"(resr), [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c new file mode 100644 index 000000000..1114fbc00 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/common_dspr2.h" + +#if HAVE_DSPR2 +void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + + __asm__ __volatile__( + "lb %[tmp1], (%[left]) \n\t" + "lb %[tmp2], 1(%[left]) \n\t" + "lb %[tmp3], 2(%[left]) \n\t" + "lb %[tmp4], 3(%[left]) \n\t" + "lb %[tmp5], 4(%[left]) \n\t" + "lb %[tmp6], 5(%[left]) \n\t" + "lb %[tmp7], 6(%[left]) \n\t" + "lb %[tmp8], 7(%[left]) \n\t" + + "replv.qb %[tmp1], %[tmp1] \n\t" + "replv.qb %[tmp2], %[tmp2] \n\t" + "replv.qb %[tmp3], %[tmp3] \n\t" + "replv.qb %[tmp4], %[tmp4] \n\t" + "replv.qb %[tmp5], %[tmp5] \n\t" + "replv.qb %[tmp6], %[tmp6] \n\t" + "replv.qb %[tmp7], %[tmp7] \n\t" + "replv.qb %[tmp8], %[tmp8] \n\t" + + "sw %[tmp1], (%[dst]) \n\t" + "sw %[tmp1], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp2], (%[dst]) \n\t" + "sw %[tmp2], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp3], (%[dst]) \n\t" + "sw %[tmp3], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp4], (%[dst]) \n\t" + "sw %[tmp4], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp5], (%[dst]) \n\t" + "sw %[tmp5], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp6], (%[dst]) \n\t" + "sw %[tmp6], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp7], (%[dst]) \n\t" + "sw %[tmp7], 4(%[dst]) \n\t" + "add %[dst], %[dst], %[stride] \n\t" + "sw %[tmp8], (%[dst]) \n\t" + "sw %[tmp8], 4(%[dst]) \n\t" + + : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3), + [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7), + [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8) + : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); +} + +void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t expected_dc; + int32_t average; + int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; + int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; + + __asm__ __volatile__( + "lw %[above1], (%[above]) \n\t" + "lw %[above2], 4(%[above]) \n\t" + "lw %[left1], (%[left]) \n\t" + "lw %[left2], 4(%[left]) \n\t" + + "preceu.ph.qbl %[above_l1], %[above1] \n\t" + "preceu.ph.qbr %[above_r1], %[above1] \n\t" + "preceu.ph.qbl %[left_l1], %[left1] \n\t" + "preceu.ph.qbr %[left_r1], %[left1] \n\t" + + "preceu.ph.qbl %[above_l2], %[above2] \n\t" + "preceu.ph.qbr %[above_r2], %[above2] \n\t" + "preceu.ph.qbl %[left_l2], %[left2] \n\t" + "preceu.ph.qbr %[left_r2], %[left2] \n\t" + + "addu.ph %[average], %[above_r1], %[above_l1] \n\t" + "addu.ph %[average], %[average], %[left_l1] \n\t" + "addu.ph %[average], %[average], %[left_r1] \n\t" + + "addu.ph %[average], %[average], %[above_l2] \n\t" + "addu.ph %[average], %[average], %[above_r2] \n\t" + "addu.ph %[average], %[average], %[left_l2] \n\t" + "addu.ph %[average], %[average], %[left_r2] \n\t" + + "addiu %[average], %[average], 8 \n\t" + + "srl %[tmp], %[average], 16 \n\t" + "addu.ph %[average], %[tmp], %[average] \n\t" + "srl %[expected_dc], %[average], 4 \n\t" + "replv.qb %[expected_dc], %[expected_dc] \n\t" + + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + "add %[dst], %[dst], %[stride] \n\t" + "sw %[expected_dc], (%[dst]) \n\t" + "sw %[expected_dc], 4(%[dst]) \n\t" + + : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1), + [above_r1] "=&r"(above_r1), [left1] "=&r"(left1), + [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1), + [above2] "=&r"(above2), [above_l2] "=&r"(above_l2), + [above_r2] "=&r"(above_r2), [left2] "=&r"(left2), + [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2), + [average] "=&r"(average), [tmp] "=&r"(tmp), + [expected_dc] "=&r"(expected_dc) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride)); +} + +void aom_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + int32_t abovel, abover; + int32_t abovel_1, abover_1; + int32_t left0; + int32_t res0, res1, res2, res3; + int32_t reshw; + int32_t top_left; + uint8_t *cm = aom_ff_cropTbl; + + __asm__ __volatile__( + "ulw %[reshw], (%[above]) \n\t" + "ulw %[top_left], 4(%[above]) \n\t" + + "lbu %[left0], (%[left]) \n\t" + + "preceu.ph.qbl %[abovel], %[reshw] \n\t" + "preceu.ph.qbr %[abover], %[reshw] \n\t" + "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" + "preceu.ph.qbr %[abover_1], %[top_left] \n\t" + + "lbu %[top_left], -1(%[above]) \n\t" + "replv.ph %[left0], %[left0] \n\t" + + "replv.ph %[top_left], %[top_left] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 1(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 2(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 3(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 4(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 5(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 6(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbu %[left0], 7(%[left]) \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + "replv.ph %[left0], %[left0] \n\t" + "add %[dst], %[dst], %[stride] \n\t" + + "addu.ph %[reshw], %[abovel], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], (%[dst]) \n\t" + "sb %[res1], 1(%[dst]) \n\t" + "sb %[res2], 2(%[dst]) \n\t" + "sb %[res3], 3(%[dst]) \n\t" + + "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res2], %[reshw], 16 \n\t" + "sra %[res2], %[res2], 16 \n\t" + "sra %[res3], %[reshw], 16 \n\t" + + "addu.ph %[reshw], %[abover_1], %[left0] \n\t" + "subu.ph %[reshw], %[reshw], %[top_left] \n\t" + + "sll %[res0], %[reshw], 16 \n\t" + "sra %[res0], %[res0], 16 \n\t" + "sra %[res1], %[reshw], 16 \n\t" + + "lbux %[res0], %[res0](%[cm]) \n\t" + "lbux %[res1], %[res1](%[cm]) \n\t" + "lbux %[res2], %[res2](%[cm]) \n\t" + "lbux %[res3], %[res3](%[cm]) \n\t" + + "sb %[res0], 4(%[dst]) \n\t" + "sb %[res1], 5(%[dst]) \n\t" + "sb %[res2], 6(%[dst]) \n\t" + "sb %[res3], 7(%[dst]) \n\t" + + : [abovel] "=&r"(abovel), [abover] "=&r"(abover), + [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1), + [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3), + [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw), + [top_left] "=&r"(top_left) + : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), + [stride] "r"(stride), [cm] "r"(cm)); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c new file mode 100644 index 000000000..e8eaec7a9 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ + { \ + out0 = __msa_subs_u_h(out0, in0); \ + out1 = __msa_subs_u_h(out1, in1); \ + } + +static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t src_data; + + src_data = LW(src); + + SW4(src_data, src_data, src_data, src_data, dst, dst_stride); +} + +static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint32_t src_data1, src_data2; + + src_data1 = LW(src); + src_data2 = LW(src + 4); + + for (row = 8; row--;) { + SW(src_data1, dst); + SW(src_data2, (dst + 4)); + dst += dst_stride; + } +} + +static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src0; + + src0 = LD_UB(src); + + for (row = 16; row--;) { + ST_UB(src0, dst); + dst += dst_stride; + } +} + +static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 src1, src2; + + src1 = LD_UB(src); + src2 = LD_UB(src + 16); + + for (row = 32; row--;) { + ST_UB2(src1, src2, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t out0, out1, out2, out3; + + out0 = src[0] * 0x01010101; + out1 = src[1] * 0x01010101; + out2 = src[2] * 0x01010101; + out3 = src[3] * 0x01010101; + + SW4(out0, out1, out2, out3, dst, dst_stride); +} + +static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t out0, out1, out2, out3, out4, out5, out6, out7; + + out0 = src[0] * 0x0101010101010101ull; + out1 = src[1] * 0x0101010101010101ull; + out2 = src[2] * 0x0101010101010101ull; + out3 = src[3] * 0x0101010101010101ull; + out4 = src[4] * 0x0101010101010101ull; + out5 = src[5] * 0x0101010101010101ull; + out6 = src[6] * 0x0101010101010101ull; + out7 = src[7] * 0x0101010101010101ull; + + SD4(out0, out1, out2, out3, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out4, out5, out6, out7, dst, dst_stride); +} + +static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 4; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB4(src0, src1, src2, src3, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + uint8_t inp0, inp1, inp2, inp3; + v16u8 src0, src1, src2, src3; + + for (row = 8; row--;) { + inp0 = src[0]; + inp1 = src[1]; + inp2 = src[2]; + inp3 = src[3]; + src += 4; + + src0 = (v16u8)__msa_fill_b(inp0); + src1 = (v16u8)__msa_fill_b(inp1); + src2 = (v16u8)__msa_fill_b(inp2); + src3 = (v16u8)__msa_fill_b(inp3); + + ST_UB2(src0, src0, dst, 16); + dst += dst_stride; + ST_UB2(src1, src1, dst, 16); + dst += dst_stride; + ST_UB2(src2, src2, dst, 16); + dst += dst_stride; + ST_UB2(src3, src3, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_4x4_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0, val1; + v16i8 store, src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LW(src_top); + val1 = LW(src_left); + INSERT_W2_SB(val0, val1, src); + sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t val0; + v16i8 store, data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + + val0 = LW(src); + data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); + sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_w((v4i32)store, 0); + + SW4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_w((v4i32)store, 0); + + SW4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_8x8_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0, val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src_top); + val1 = LD(src_left); + INSERT_D2_UB(val0, val1, src); + sum_h = __msa_hadd_u_h(src, src); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint64_t val0; + v16i8 store; + v16u8 data = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + val0 = LD(src); + data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); + store = __msa_splati_b((v16i8)sum_w, 0); + val0 = __msa_copy_u_d((v2i64)store, 0); + + SD4(val0, val0, val0, val0, dst, dst_stride); + dst += (4 * dst_stride); + SD4(val0, val0, val0, val0, dst, dst_stride); +} + +static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { + uint64_t out; + const v16i8 store = __msa_ldi_b(128); + + out = __msa_copy_u_d((v2i64)store, 0); + + SD4(out, out, out, out, dst, dst_stride); + dst += (4 * dst_stride); + SD4(out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_16x16_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + v16u8 top, left, out; + v8u16 sum_h, sum_top, sum_left; + v4u32 sum_w; + v2u64 sum_d; + + top = LD_UB(src_top); + left = LD_UB(src_left); + HADD_UB2_UH(top, left, sum_top, sum_left); + sum_h = sum_top + sum_left; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + v16u8 data, out; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + data = LD_UB(src); + sum_h = __msa_hadd_u_h(data, data); + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { + const v16u8 out = (v16u8)__msa_ldi_b(128); + + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); + dst += (8 * dst_stride); + ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); +} + +static void intra_predict_dc_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 top0, top1, left0, left1, out; + v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src_top, 16, top0, top1); + LD_UB2(src_left, 16, left0, left1); + HADD_UB2_UH(top0, top1, sum_top0, sum_top1); + HADD_UB2_UH(left0, left1, sum_left0, sum_left1); + sum_h = sum_top0 + sum_top1; + sum_h += sum_left0 + sum_left1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, + int32_t dst_stride) { + uint32_t row; + v16u8 data0, data1, out; + v8u16 sum_h, sum_data0, sum_data1; + v4u32 sum_w; + v2u64 sum_d; + + LD_UB2(src, 16, data0, data1); + HADD_UB2_UH(data0, data1, sum_data0, sum_data1); + sum_h = sum_data0 + sum_data1; + sum_w = __msa_hadd_u_w(sum_h, sum_h); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); + sum_d = __msa_hadd_u_d(sum_w, sum_w); + sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); + out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { + uint32_t row; + const v16u8 out = (v16u8)__msa_ldi_b(128); + + for (row = 16; row--;) { + ST_UB2(out, out, dst, 16); + dst += dst_stride; + ST_UB2(out, out, dst, 16); + dst += dst_stride; + } +} + +static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint32_t val; + uint8_t top_left = src_top_ptr[-1]; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + + src_top_left = (v8u16)__msa_fill_h(top_left); + val = LW(src_top_ptr); + src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); + + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); +} + +static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint64_t val; + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; + v8u16 src_top_left, vec0, vec1, vec2, vec3; + v16u8 src0, src1, src2, src3; + + val = LD(src_top_ptr); + src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 2; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, + src_left3, src_top, src0, src1, src2, src3); + HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); + SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); + PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); + ST8x4_UB(tmp0, tmp1, dst, dst_stride); + dst += (4 * dst_stride); + } +} + +static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top_ptr[-1]; + uint32_t loop_cnt; + v16i8 src_top, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r, res_l; + + src_top = LD_SB(src_top_ptr); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 4; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVRL_B2_UH(src_left0, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left1, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left2, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + + ILVRL_B2_UH(src_left3, src_top, res_r, res_l); + HADD_UB2_UH(res_r, res_l, res_r, res_l); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); + SAT_UH2_UH(res_r, res_l, 7); + PCKEV_ST_SB(res_r, res_l, dst); + dst += dst_stride; + } +} + +static void intra_predict_tm_32x32_msa(const uint8_t *src_top, + const uint8_t *src_left, uint8_t *dst, + int32_t dst_stride) { + uint8_t top_left = src_top[-1]; + uint32_t loop_cnt; + v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; + v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; + + LD_SB2(src_top, 16, src_top0, src_top1); + src_top_left = (v8u16)__msa_fill_h(top_left); + + for (loop_cnt = 8; loop_cnt--;) { + src_left0 = __msa_fill_b(src_left[0]); + src_left1 = __msa_fill_b(src_left[1]); + src_left2 = __msa_fill_b(src_left[2]); + src_left3 = __msa_fill_b(src_left[3]); + src_left += 4; + + ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + + ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); + ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); + HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); + IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); + SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); + PCKEV_ST_SB(res_r0, res_l0, dst); + PCKEV_ST_SB(res_r1, res_l1, dst + 16); + dst += dst_stride; + } +} + +void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_4x4_msa(above, dst, y_stride); +} + +void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_8x8_msa(above, dst, y_stride); +} + +void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_16x16_msa(above, dst, y_stride); +} + +void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_vert_32x32_msa(above, dst, y_stride); +} + +void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_4x4_msa(left, dst, y_stride); +} + +void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_8x8_msa(left, dst, y_stride); +} + +void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_16x16_msa(left, dst, y_stride); +} + +void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_horiz_32x32_msa(left, dst, y_stride); +} + +void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_4x4_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_8x8_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_16x16_msa(above, left, dst, y_stride); +} + +void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_dc_32x32_msa(above, left, dst, y_stride); +} + +void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_4x4_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_8x8_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_16x16_msa(above, dst, y_stride); +} + +void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + + intra_predict_dc_tl_32x32_msa(above, dst, y_stride); +} + +void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_4x4_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_8x8_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_16x16_msa(left, dst, y_stride); +} + +void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + + intra_predict_dc_tl_32x32_msa(left, dst, y_stride); +} + +void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_4x4_msa(dst, y_stride); +} + +void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_8x8_msa(dst, y_stride); +} + +void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_16x16_msa(dst, y_stride); +} + +void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + + intra_predict_128dc_32x32_msa(dst, y_stride); +} + +void aom_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_4x4_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_8x8_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_16x16_msa(above, left, dst, y_stride); +} + +void aom_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, + const uint8_t *above, const uint8_t *left) { + intra_predict_tm_32x32_msa(above, left, dst, y_stride); +} diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h new file mode 100644 index 000000000..8a85e26f3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ +#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/inv_txfm.h" +#include "aom_dsp/mips/common_dspr2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ + ({ \ + \ + int32_t tmp, out; \ + int dct_cost_rounding = DCT_CONST_ROUNDING; \ + int in = input; \ + \ + __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac1 " \ + " \n\t" \ + "mthi $zero, $ac1 " \ + " \n\t" \ + "madd $ac1, %[in], " \ + "%[cospi_16_64] \n\t" \ + "extp %[tmp], $ac1, " \ + "31 \n\t" \ + \ + /* out = dct_const_round_shift(out * cospi_16_64); */ \ + "mtlo %[dct_cost_rounding], $ac2 " \ + " \n\t" \ + "mthi $zero, $ac2 " \ + " \n\t" \ + "madd $ac2, %[tmp], " \ + "%[cospi_16_64] \n\t" \ + "extp %[out], $ac2, " \ + "31 \n\t" \ + \ + : [tmp] "=&r"(tmp), [out] "=r"(out) \ + : [in] "r"(in), \ + [dct_cost_rounding] "r"(dct_cost_rounding), \ + [cospi_16_64] "r"(cospi_16_64)); \ + out; \ + }) + +void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output); +void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void iadst4_dspr2(const int16_t *input, int16_t *output); +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride); +void iadst8_dspr2(const int16_t *input, int16_t *output); +void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); +void iadst16_dspr2(const int16_t *input, int16_t *output); + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h new file mode 100644 index 000000000..122667aa8 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_ +#define AOM_DSP_MIPS_INV_TXFM_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/mips/txfm_macros_msa.h" +#include "aom_dsp/txfm_common.h" + +#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ + v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ + v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ + cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ + v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ + cospi_24_64, -cospi_24_64, 0, 0 }; \ + \ + SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ + ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in7, in0, in4, in3); \ + \ + SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ + cnst2_m = -cnst0_m; \ + ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ + SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ + cnst4_m = -cnst2_m; \ + ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ + cnst2_m, cnst3_m, in5, in2, in6, in1); \ + BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ + out7 = -s0_m; \ + out0 = s1_m; \ + \ + SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ + \ + ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ + cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + cnst1_m = cnst0_m; \ + \ + ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ + ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ + DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ + cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ + \ + SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ + cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ + \ + ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ + ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ + out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ + out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ + out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ + out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ + \ + out1 = -out1; \ + out3 = -out3; \ + out5 = -out5; \ + } + +#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \ + ({ \ + v8i16 out0_m, r0_m, r1_m; \ + \ + r0_m = __msa_fill_h(c0_h); \ + r1_m = __msa_fill_h(c1_h); \ + out0_m = __msa_ilvev_h(r1_m, r0_m); \ + \ + out0_m; \ + }) + +#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ + { \ + uint8_t *dst_m = (uint8_t *)(dst); \ + v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 zero_m = { 0 }; \ + v8i16 res0_m, res1_m, res2_m, res3_m; \ + \ + LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ + ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ + res0_m, res1_m, res2_m, res3_m); \ + ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ + res2_m, res3_m); \ + CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ + PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ + } + +#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 c0_m, c1_m, c2_m, c3_m; \ + v8i16 step0_m, step1_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + step0_m = __msa_ilvr_h(in2, in0); \ + DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + \ + c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + step1_m = __msa_ilvr_h(in3, in1); \ + DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + \ + PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ + SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ + BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ + out0, out1, out2, out3); \ + } + +#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 res0_m, res1_m, c0_m, c1_m; \ + v8i16 k1_m, k2_m, k3_m, k4_m; \ + v8i16 zero_m = { 0 }; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 int0_m, int1_m, int2_m, int3_m; \ + v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ + -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ + \ + SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ + ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ + int0_m = tmp2_m + tmp1_m; \ + \ + SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ + ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int1_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ + ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ + DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ + int2_m = tmp0_m + tmp1_m; \ + \ + c0_m = __msa_splati_h(mask_m, 6); \ + c0_m = __msa_ilvev_h(c0_m, k1_m); \ + \ + res0_m = __msa_ilvr_h((in1), (in3)); \ + tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ + int3_m = tmp2_m + tmp0_m; \ + \ + res0_m = __msa_ilvr_h((in2), (in3)); \ + c1_m = __msa_ilvev_h(k4_m, k3_m); \ + \ + tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ + res1_m = __msa_ilvr_h((in0), (in2)); \ + c1_m = __msa_ilvev_h(k1_m, zero_m); \ + \ + tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ + int3_m += tmp2_m; \ + int3_m += tmp3_m; \ + \ + SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ + PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ + } + +#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ + ({ \ + v8i16 c0_m, c1_m; \ + \ + SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ + c0_m = __msa_ilvev_h(c1_m, c0_m); \ + \ + c0_m; \ + }) + +/* multiply and add macro */ +#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ + DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ + DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ + } + +/* idct 8x8 macro */ +#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ + cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ + \ + k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \ + k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \ + k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \ + k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \ + AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ + SUB2(in1, in3, in7, in5, res0_m, res1_m); \ + k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \ + k1_m = __msa_splati_h(mask_m, 4); \ + \ + ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ + DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + tp4_m = in1 + in3; \ + PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ + tp7_m = in7 + in5; \ + k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ + BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ + BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ + out1, out2, out3, out4, out5, out6, out7); \ + } + +#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ + v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ + v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ + v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ + cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ + v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ + -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ + v8i16 mask3_m = { \ + -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ + }; \ + \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \ + k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \ + ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \ + ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ + k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \ + k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \ + ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \ + ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ + r5_m, r6_m, r7_m); \ + ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ + SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ + ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ + BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ + k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \ + k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \ + ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ + r1_m, r2_m, r3_m); \ + k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ + r6_m, r7_m); \ + ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ + SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ + m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ + k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \ + k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \ + ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ + DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ + m1_m, m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ + ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ + DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ + m2_m, m3_m); \ + SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ + \ + out1 = -in1; \ + out3 = -in3; \ + out5 = -in5; \ + out7 = -in7; \ + } + +#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ + r12, r13, r14, r15, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ + v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ + v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ + v8i16 h8_m, h9_m, h10_m, h11_m; \ + v8i16 k0_m, k1_m, k2_m, k3_m; \ + \ + /* stage 1 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ + MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ + MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ + MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ + g11_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ + k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ + MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ + g15_m); \ + \ + /* stage 2 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ + k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ + MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ + h3_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ + k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ + MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ + h6_m, h7_m); \ + BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ + BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ + h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ + \ + /* stage 3 */ \ + BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ + k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ + k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ + k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ + MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ + out7); \ + MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ + out13, out15); \ + \ + /* stage 4 */ \ + k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ + k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ + k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ + k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ + MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ + MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ + MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ + MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ + } + +void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output); +void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, + int32_t dst_stride); +void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); +#endif // AOM_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c new file mode 100644 index 000000000..c63b1e857 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c @@ -0,0 +1,1190 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct16_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_10, step1_11, step1_12, step1_13; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + + for (i = no_rows; i--;) { + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 16)); + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step2_12] \n\t" + "add %[load5], %[load5], %[step2_15] \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step2_13] \n\t" + "add %[load6], %[load6], %[step2_14] \n\t" + "sh %[load5], 0(%[output]) \n\t" + "sh %[load6], 32(%[output]) \n\t" + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "add %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + "add %[load6], %[load6], %[step2_11] \n\t" + "sh %[load5], 192(%[output]) \n\t" + "sh %[load6], 224(%[output]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "sub %[load5], %[load5], %[step2_11] \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step2_9] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "sh %[load5], 256(%[output]) \n\t" + "sh %[load6], 288(%[output]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_14] \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_15] \n\t" + "sh %[load5], 448(%[output]) \n\t" + "sh %[load6], 480(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), + [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), + [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), + [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), + [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); + + __asm__ __volatile__( + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sh %[load5], 64(%[output]) \n\t" + "sh %[load6], 96(%[output]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sh %[load5], 128(%[output]) \n\t" + "sh %[load6], 160(%[output]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sh %[load5], 320(%[output]) \n\t" + "sh %[load6], 352(%[output]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sh %[load5], 384(%[output]) \n\t" + "sh %[load6], 416(%[output]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6) + : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); + + input += 16; + output += 1; + } +} + +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { + int i; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int step1_8, step1_9, step1_10, step1_11; + int step1_12, step1_13, step1_14, step1_15; + int step2_0, step2_1, step2_2, step2_3; + int step2_8, step2_9, step2_10, step2_11; + int step2_12, step2_13, step2_14, step2_15; + int load1, load2, load3, load4, load5, load6, load7, load8; + int result1, result2, result3, result4; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 16; ++i) { + dest_pix = (dest + i); + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 16(%[input]) \n\t" + "lh %[load3], 8(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[step2_0], $ac1, 31 \n\t" + "extp %[step2_1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[step2_2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[step2_3], $ac1, 31 \n\t" + + "add %[step1_0], %[step2_0], %[step2_3] \n\t" + "add %[step1_1], %[step2_1], %[step2_2] \n\t" + "sub %[step1_2], %[step2_1], %[step2_2] \n\t" + "sub %[step1_3], %[step2_0], %[step2_3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), + [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), + [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load5], 2(%[input]) \n\t" + "lh %[load6], 30(%[input]) \n\t" + "lh %[load7], 18(%[input]) \n\t" + "lh %[load8], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_30_64] \n\t" + "msub $ac1, %[load6], %[cospi_2_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_14_64] \n\t" + "msub $ac3, %[load8], %[cospi_18_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_18_64] \n\t" + "madd $ac1, %[load8], %[cospi_14_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_2_64] \n\t" + "madd $ac2, %[load6], %[cospi_30_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "sub %[load5], %[result1], %[result2] \n\t" + "sub %[load6], %[result4], %[result3] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load6], %[cospi_24_64] \n\t" + "msub $ac1, %[load5], %[cospi_8_64] \n\t" + "madd $ac3, %[load5], %[cospi_24_64] \n\t" + "madd $ac3, %[load6], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[result1], %[result2] \n\t" + "add %[step2_15], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_8] "=r"(step2_8), + [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), + [step2_14] "=r"(step2_14) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 22(%[input]) \n\t" + "lh %[load3], 26(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load3], %[cospi_6_64] \n\t" + "msub $ac3, %[load4], %[cospi_26_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load1], %[cospi_10_64] \n\t" + "madd $ac1, %[load2], %[cospi_22_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load3], %[cospi_26_64] \n\t" + "madd $ac2, %[load4], %[cospi_6_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[result2], %[result1] \n\t" + "sub %[load2], %[result4], %[result3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[result1], %[result2] \n\t" + "add %[step2_12], %[result4], %[result3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); + + __asm__ __volatile__( + "lh %[load5], 4(%[input]) \n\t" + "lh %[load6], 28(%[input]) \n\t" + "lh %[load7], 20(%[input]) \n\t" + "lh %[load8], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load5], %[cospi_28_64] \n\t" + "msub $ac1, %[load6], %[cospi_4_64] \n\t" + "extp %[result1], $ac1, 31 \n\t" + + "madd $ac3, %[load7], %[cospi_12_64] \n\t" + "msub $ac3, %[load8], %[cospi_20_64] \n\t" + "extp %[result2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac1, %[load7], %[cospi_20_64] \n\t" + "madd $ac1, %[load8], %[cospi_12_64] \n\t" + "extp %[result3], $ac1, 31 \n\t" + + "madd $ac2, %[load5], %[cospi_4_64] \n\t" + "madd $ac2, %[load6], %[cospi_28_64] \n\t" + "extp %[result4], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[result4], %[result3] \n\t" + "sub %[load5], %[load5], %[result1] \n\t" + "add %[load5], %[load5], %[result2] \n\t" + + "sub %[load6], %[result1], %[result2] \n\t" + "sub %[load6], %[load6], %[result3] \n\t" + "add %[load6], %[load6], %[result4] \n\t" + + "madd $ac1, %[load5], %[cospi_16_64] \n\t" + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + + "add %[step1_4], %[result1], %[result2] \n\t" + "add %[step1_7], %[result4], %[result3] \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [result1] "=&r"(result1), + [result2] "=&r"(result2), [result3] "=&r"(result3), + [result4] "=&r"(result4), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "sub %[load5], %[step2_14], %[step2_13] \n\t" + "sub %[load5], %[load5], %[step2_9] \n\t" + "add %[load5], %[load5], %[step2_10] \n\t" + + "madd $ac0, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_14], %[step2_13] \n\t" + "sub %[load6], %[load6], %[step2_10] \n\t" + "add %[load6], %[load6], %[step2_9] \n\t" + + "madd $ac1, %[load6], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load5], %[step2_15], %[step2_12] \n\t" + "sub %[load5], %[load5], %[step2_8] \n\t" + "add %[load5], %[load5], %[step2_11] \n\t" + + "madd $ac2, %[load5], %[cospi_16_64] \n\t" + + "sub %[load6], %[step2_15], %[step2_12] \n\t" + "sub %[load6], %[load6], %[step2_11] \n\t" + "add %[load6], %[load6], %[step2_8] \n\t" + + "madd $ac3, %[load6], %[cospi_16_64] \n\t" + + "extp %[step1_10], $ac0, 31 \n\t" + "extp %[step1_13], $ac1, 31 \n\t" + "extp %[step1_11], $ac2, 31 \n\t" + "extp %[step1_12], $ac3, 31 \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), + [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), + [step1_13] "=r"(step1_13) + : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), + [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), + [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), + [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), + [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); + + step1_8 = step2_8 + step2_11; + step1_9 = step2_9 + step2_10; + step1_14 = step2_13 + step2_14; + step1_15 = step2_12 + step2_15; + + __asm__ __volatile__( + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_0], %[step1_7] \n\t" + "add %[load5], %[load5], %[step1_15] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_1], %[step1_6] \n\t" + "add %[load6], %[load6], %[step1_14] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_2], %[step1_5] \n\t" + "add %[load5], %[load5], %[step1_13] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_3], %[step1_4] \n\t" + "add %[load6], %[load6], %[step1_12] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_3], %[step1_4] \n\t" + "add %[load5], %[load5], %[step1_11] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_2], %[step1_5] \n\t" + "add %[load6], %[load6], %[step1_10] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "sub %[load5], %[step1_1], %[step1_6] \n\t" + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[load5], %[step1_9] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_0], %[step1_7] \n\t" + "add %[load6], %[load6], %[step1_8] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_0], %[step1_7] \n\t" + "sub %[load5], %[load5], %[step1_8] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_1], %[step1_6] \n\t" + "sub %[load6], %[load6], %[step1_9] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "sub %[load5], %[step1_2], %[step1_5] \n\t" + "sub %[load5], %[load5], %[step1_10] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "sub %[load6], %[step1_3], %[step1_4] \n\t" + "sub %[load6], %[load6], %[step1_11] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_3], %[step1_4] \n\t" + "sub %[load5], %[load5], %[step1_12] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_2], %[step1_5] \n\t" + "sub %[load6], %[load6], %[step1_13] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[load7], 0(%[dest_pix]) \n\t" + "add %[load5], %[step1_1], %[step1_6] \n\t" + "sub %[load5], %[load5], %[step1_14] \n\t" + "addi %[load5], %[load5], 32 \n\t" + "sra %[load5], %[load5], 6 \n\t" + "add %[load7], %[load7], %[load5] \n\t" + "lbux %[load5], %[load7](%[cm]) \n\t" + "add %[load6], %[step1_0], %[step1_7] \n\t" + "sub %[load6], %[load6], %[step1_15] \n\t" + "sb %[load5], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[load8], 0(%[dest_pix]) \n\t" + "addi %[load6], %[load6], 32 \n\t" + "sra %[load6], %[load6], 6 \n\t" + "add %[load8], %[load8], %[load6] \n\t" + "lbux %[load6], %[load8](%[cm]) \n\t" + "sb %[load6], 0(%[dest_pix]) \n\t" + + : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), + [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) + : + [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), + [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), + [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); + + input += 16; + } +} + +void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct16_rows_dspr2(input, out, 16); + + // Then transform columns and add to dest + idct16_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[16 * 16]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 4x4 area, we only need to calculate first 4 rows here. + idct16_rows_dspr2(input, outptr, 4); + + outptr += 4; + for (i = 0; i < 6; ++i) { + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 128(%[outptr]) \n\t" + "sw $zero, 160(%[outptr]) \n\t" + "sw $zero, 192(%[outptr]) \n\t" + "sw $zero, 224(%[outptr]) \n\t" + "sw $zero, 256(%[outptr]) \n\t" + "sw $zero, 288(%[outptr]) \n\t" + "sw $zero, 320(%[outptr]) \n\t" + "sw $zero, 352(%[outptr]) \n\t" + "sw $zero, 384(%[outptr]) \n\t" + "sw $zero, 416(%[outptr]) \n\t" + "sw $zero, 448(%[outptr]) \n\t" + "sw $zero, 480(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + outptr += 2; + } + + // Then transform columns + idct16_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst16_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = output[8] = output[9] = output[10] = + output[11] = output[12] = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c new file mode 100644 index 000000000..d469d1ad0 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c @@ -0,0 +1,1042 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; + int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; + int16_t step1_27, step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; + int16_t step3_28, step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int i, temp21; + uint8_t *dest_pix, *dest_pix1; + const int const_2_power_13 = 8192; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 32; ++i) { + dest_pix = dest + i; + dest_pix1 = dest + i + 31 * dest_stride; + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + // stage 7 + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_2], %[step2_29] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_3], %[step2_28] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), + [step2_31] "r"(step2_31)); + + step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_6], %[step1_25] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_7], %[step1_24] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), + [step1_27] "r"(step1_27)); + + step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_10], %[step1_21] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_11], %[step1_20] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), + [step1_23] "r"(step1_23)); + + step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix]) \n\t" + "add %[temp0], %[step1_14], %[step2_17] \n\t" + "addi %[temp0], %[temp0], 32 \n\t" + "sra %[temp0], %[temp0], 6 \n\t" + "add %[temp2], %[temp2], %[temp0] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "add %[temp1], %[step1_15], %[step2_16] \n\t" + "sb %[temp0], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix]) \n\t" + "addi %[temp1], %[temp1], 32 \n\t" + "sra %[temp1], %[temp1], 6 \n\t" + "add %[temp3], %[temp3], %[temp1] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), + [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), + [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), + [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); + + step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); + step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); + step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); + step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); + + __asm__ __volatile__( + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_15] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_14] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + + "lbu %[temp2], 0(%[dest_pix1]) \n\t" + "add %[temp2], %[temp2], %[step3_13] \n\t" + "lbux %[temp0], %[temp2](%[cm]) \n\t" + "sb %[temp0], 0(%[dest_pix1]) \n\t" + "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "lbu %[temp3], 0(%[dest_pix1]) \n\t" + "add %[temp3], %[temp3], %[step3_12] \n\t" + "lbux %[temp1], %[temp3](%[cm]) \n\t" + "sb %[temp1], 0(%[dest_pix1]) \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) + : [cm] "r"(cm), [dest_stride] "r"(dest_stride), + [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), + [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + + input += 32; + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c new file mode 100644 index 000000000..fa7703217 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +static void idct32_rows_dspr2(const int16_t *input, int16_t *output, + uint32_t no_rows) { + int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int16_t step1_28, step1_29, step1_30, step1_31; + int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int16_t step2_28, step2_29, step2_30, step2_31; + int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int16_t step3_29, step3_30, step3_31; + int temp0, temp1, temp2, temp3; + int load1, load2, load3, load4; + int result1, result2; + int temp21; + int i; + const int const_2_power_13 = 8192; + const int32_t *input_int; + + for (i = no_rows; i--;) { + input_int = (const int32_t *)input; + + if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | + input_int[4] | input_int[5] | input_int[6] | input_int[7] | + input_int[8] | input_int[9] | input_int[10] | input_int[11] | + input_int[12] | input_int[13] | input_int[14] | input_int[15])) { + input += 32; + + __asm__ __volatile__( + "sh $zero, 0(%[output]) \n\t" + "sh $zero, 64(%[output]) \n\t" + "sh $zero, 128(%[output]) \n\t" + "sh $zero, 192(%[output]) \n\t" + "sh $zero, 256(%[output]) \n\t" + "sh $zero, 320(%[output]) \n\t" + "sh $zero, 384(%[output]) \n\t" + "sh $zero, 448(%[output]) \n\t" + "sh $zero, 512(%[output]) \n\t" + "sh $zero, 576(%[output]) \n\t" + "sh $zero, 640(%[output]) \n\t" + "sh $zero, 704(%[output]) \n\t" + "sh $zero, 768(%[output]) \n\t" + "sh $zero, 832(%[output]) \n\t" + "sh $zero, 896(%[output]) \n\t" + "sh $zero, 960(%[output]) \n\t" + "sh $zero, 1024(%[output]) \n\t" + "sh $zero, 1088(%[output]) \n\t" + "sh $zero, 1152(%[output]) \n\t" + "sh $zero, 1216(%[output]) \n\t" + "sh $zero, 1280(%[output]) \n\t" + "sh $zero, 1344(%[output]) \n\t" + "sh $zero, 1408(%[output]) \n\t" + "sh $zero, 1472(%[output]) \n\t" + "sh $zero, 1536(%[output]) \n\t" + "sh $zero, 1600(%[output]) \n\t" + "sh $zero, 1664(%[output]) \n\t" + "sh $zero, 1728(%[output]) \n\t" + "sh $zero, 1792(%[output]) \n\t" + "sh $zero, 1856(%[output]) \n\t" + "sh $zero, 1920(%[output]) \n\t" + "sh $zero, 1984(%[output]) \n\t" + + : + : [output] "r"(output)); + + output += 1; + + continue; + } + + /* prefetch row */ + prefetch_load((const uint8_t *)(input + 32)); + prefetch_load((const uint8_t *)(input + 48)); + + __asm__ __volatile__( + "lh %[load1], 2(%[input]) \n\t" + "lh %[load2], 62(%[input]) \n\t" + "lh %[load3], 34(%[input]) \n\t" + "lh %[load4], 30(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_31_64] \n\t" + "msub $ac1, %[load2], %[cospi_1_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_1_64] \n\t" + "madd $ac3, %[load2], %[cospi_31_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_15_64] \n\t" + "msub $ac2, %[load4], %[cospi_17_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_17_64] \n\t" + "madd $ac1, %[load4], %[cospi_15_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load2], %[temp0], %[temp1] \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_17], $ac1, 31 \n\t" + "extp %[step1_30], $ac3, 31 \n\t" + "add %[step1_16], %[temp0], %[temp1] \n\t" + "add %[step1_31], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), + [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), + [step1_31] "=r"(step1_31) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), + [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 18(%[input]) \n\t" + "lh %[load2], 46(%[input]) \n\t" + "lh %[load3], 50(%[input]) \n\t" + "lh %[load4], 14(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_23_64] \n\t" + "msub $ac1, %[load2], %[cospi_9_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_9_64] \n\t" + "madd $ac3, %[load2], %[cospi_23_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_7_64] \n\t" + "msub $ac2, %[load4], %[cospi_25_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_25_64] \n\t" + "madd $ac1, %[load4], %[cospi_7_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "msub $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + + "extp %[step1_18], $ac1, 31 \n\t" + "extp %[step1_29], $ac3, 31 \n\t" + "add %[step1_19], %[temp0], %[temp1] \n\t" + "add %[step1_28], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), + [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), + [step1_29] "=r"(step1_29) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), + [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); + + __asm__ __volatile__( + "lh %[load1], 10(%[input]) \n\t" + "lh %[load2], 54(%[input]) \n\t" + "lh %[load3], 42(%[input]) \n\t" + "lh %[load4], 22(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_27_64] \n\t" + "msub $ac1, %[load2], %[cospi_5_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_5_64] \n\t" + "madd $ac3, %[load2], %[cospi_27_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_11_64] \n\t" + "msub $ac2, %[load4], %[cospi_21_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_21_64] \n\t" + "madd $ac1, %[load4], %[cospi_11_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "madd $ac1, %[load2], %[cospi_12_64] \n\t" + "msub $ac1, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load1], %[cospi_12_64] \n\t" + "madd $ac3, %[load2], %[cospi_20_64] \n\t" + + "extp %[step1_21], $ac1, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" + "add %[step1_20], %[temp0], %[temp1] \n\t" + "add %[step1_27], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), + [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), + [step1_27] "=r"(step1_27) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), + [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 26(%[input]) \n\t" + "lh %[load2], 38(%[input]) \n\t" + "lh %[load3], 58(%[input]) \n\t" + "lh %[load4], 6(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_19_64] \n\t" + "msub $ac1, %[load2], %[cospi_13_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_13_64] \n\t" + "madd $ac3, %[load2], %[cospi_19_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_3_64] \n\t" + "msub $ac2, %[load4], %[cospi_29_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_29_64] \n\t" + "madd $ac1, %[load4], %[cospi_3_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_12_64] \n\t" + "msub $ac1, %[load2], %[cospi_20_64] \n\t" + "msub $ac3, %[load1], %[cospi_20_64] \n\t" + "madd $ac3, %[load2], %[cospi_12_64] \n\t" + + "extp %[step1_22], $ac1, 31 \n\t" + "extp %[step1_25], $ac3, 31 \n\t" + "add %[step1_23], %[temp0], %[temp1] \n\t" + "add %[step1_24], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), + [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), + [step1_25] "=r"(step1_25) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), + [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), + [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); + + __asm__ __volatile__( + "lh %[load1], 4(%[input]) \n\t" + "lh %[load2], 60(%[input]) \n\t" + "lh %[load3], 36(%[input]) \n\t" + "lh %[load4], 28(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_30_64] \n\t" + "msub $ac1, %[load2], %[cospi_2_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_2_64] \n\t" + "madd $ac3, %[load2], %[cospi_30_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_14_64] \n\t" + "msub $ac2, %[load4], %[cospi_18_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_18_64] \n\t" + "madd $ac1, %[load4], %[cospi_14_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp0], %[temp1] \n\t" + "sub %[load2], %[temp3], %[temp2] \n\t" + + "msub $ac1, %[load1], %[cospi_8_64] \n\t" + "madd $ac1, %[load2], %[cospi_24_64] \n\t" + "madd $ac3, %[load1], %[cospi_24_64] \n\t" + "madd $ac3, %[load2], %[cospi_8_64] \n\t" + + "extp %[step2_9], $ac1, 31 \n\t" + "extp %[step2_14], $ac3, 31 \n\t" + "add %[step2_8], %[temp0], %[temp1] \n\t" + "add %[step2_15], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), + [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), + [step2_15] "=r"(step2_15) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), + [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "lh %[load1], 20(%[input]) \n\t" + "lh %[load2], 44(%[input]) \n\t" + "lh %[load3], 52(%[input]) \n\t" + "lh %[load4], 12(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_22_64] \n\t" + "msub $ac1, %[load2], %[cospi_10_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_10_64] \n\t" + "madd $ac3, %[load2], %[cospi_22_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_6_64] \n\t" + "msub $ac2, %[load4], %[cospi_26_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_26_64] \n\t" + "madd $ac1, %[load4], %[cospi_6_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp1], %[temp0] \n\t" + "sub %[load2], %[temp2], %[temp3] \n\t" + + "msub $ac1, %[load1], %[cospi_24_64] \n\t" + "msub $ac1, %[load2], %[cospi_8_64] \n\t" + "madd $ac3, %[load2], %[cospi_24_64] \n\t" + "msub $ac3, %[load1], %[cospi_8_64] \n\t" + + "extp %[step2_10], $ac1, 31 \n\t" + "extp %[step2_13], $ac3, 31 \n\t" + "add %[step2_11], %[temp0], %[temp1] \n\t" + "add %[step2_12], %[temp2], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), + [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), + [step2_13] "=r"(step2_13) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), + [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), + [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "sub %[temp0], %[step2_14], %[step2_13] \n\t" + "sub %[temp0], %[temp0], %[step2_9] \n\t" + "add %[temp0], %[temp0], %[step2_10] \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp1], %[step2_14], %[step2_13] \n\t" + "add %[temp1], %[temp1], %[step2_9] \n\t" + "sub %[temp1], %[temp1], %[step2_10] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "sub %[temp0], %[step2_15], %[step2_12] \n\t" + "sub %[temp0], %[temp0], %[step2_8] \n\t" + "add %[temp0], %[temp0], %[step2_11] \n\t" + "madd $ac2, %[temp0], %[cospi_16_64] \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "sub %[temp1], %[step2_15], %[step2_12] \n\t" + "add %[temp1], %[temp1], %[step2_8] \n\t" + "sub %[temp1], %[temp1], %[step2_11] \n\t" + "madd $ac3, %[temp1], %[cospi_16_64] \n\t" + + "add %[step3_8], %[step2_8], %[step2_11] \n\t" + "add %[step3_9], %[step2_9], %[step2_10] \n\t" + "add %[step3_14], %[step2_13], %[step2_14] \n\t" + "add %[step3_15], %[step2_12], %[step2_15] \n\t" + + "extp %[step3_10], $ac0, 31 \n\t" + "extp %[step3_13], $ac1, 31 \n\t" + "extp %[step3_11], $ac2, 31 \n\t" + "extp %[step3_12], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), + [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), + [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), + [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), + [step3_15] "=r"(step3_15) + : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), + [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), + [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), + [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), + [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); + + step2_18 = step1_17 - step1_18; + step2_29 = step1_30 - step1_29; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + + : [step3_18] "=r"(step3_18) + : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), + [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; + step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_19 = step1_16 - step1_19; + step2_28 = step1_31 - step1_28; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" + "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + + : [step3_19] "=r"(step3_19) + : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), + [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; + step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_16 = step1_16 + step1_19; + step3_17 = step1_17 + step1_18; + step3_30 = step1_29 + step1_30; + step3_31 = step1_28 + step1_31; + + step2_20 = step1_23 - step1_20; + step2_27 = step1_24 - step1_27; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" + "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac0, 31 \n\t" + + : [step3_20] "=r"(step3_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; + step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step2_21 = step1_22 - step1_21; + step2_26 = step1_25 - step1_26; + + __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" + "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac1, 31 \n\t" + + : [step3_21] "=r"(step3_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64)); + + temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; + step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + step3_22 = step1_21 + step1_22; + step3_23 = step1_20 + step1_23; + step3_24 = step1_24 + step1_27; + step3_25 = step1_25 + step1_26; + + step2_16 = step3_16 + step3_23; + step2_17 = step3_17 + step3_22; + step2_18 = step3_18 + step3_21; + step2_19 = step3_19 + step3_20; + step2_20 = step3_19 - step3_20; + step2_21 = step3_18 - step3_21; + step2_22 = step3_17 - step3_22; + step2_23 = step3_16 - step3_23; + + step2_24 = step3_31 - step3_24; + step2_25 = step3_30 - step3_25; + step2_26 = step3_29 - step3_26; + step2_27 = step3_28 - step3_27; + step2_28 = step3_28 + step3_27; + step2_29 = step3_29 + step3_26; + step2_30 = step3_30 + step3_25; + step2_31 = step3_31 + step3_24; + + __asm__ __volatile__( + "lh %[load1], 0(%[input]) \n\t" + "lh %[load2], 32(%[input]) \n\t" + "lh %[load3], 16(%[input]) \n\t" + "lh %[load4], 48(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "add %[result1], %[load1], %[load2] \n\t" + "sub %[result2], %[load1], %[load2] \n\t" + "madd $ac1, %[result1], %[cospi_16_64] \n\t" + "madd $ac2, %[result2], %[cospi_16_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac3, %[load3], %[cospi_24_64] \n\t" + "msub $ac3, %[load4], %[cospi_8_64] \n\t" + "extp %[temp2], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "madd $ac1, %[load3], %[cospi_8_64] \n\t" + "madd $ac1, %[load4], %[cospi_24_64] \n\t" + "extp %[temp3], $ac1, 31 \n\t" + + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [result1] "=&r"(result1), + [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), + [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), + [step1_3] "=r"(step1_3) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), + [cospi_8_64] "r"(cospi_8_64) + + ); + + __asm__ __volatile__( + "lh %[load1], 8(%[input]) \n\t" + "lh %[load2], 56(%[input]) \n\t" + "lh %[load3], 40(%[input]) \n\t" + "lh %[load4], 24(%[input]) \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "madd $ac1, %[load1], %[cospi_28_64] \n\t" + "msub $ac1, %[load2], %[cospi_4_64] \n\t" + "extp %[temp0], $ac1, 31 \n\t" + + "madd $ac3, %[load1], %[cospi_4_64] \n\t" + "madd $ac3, %[load2], %[cospi_28_64] \n\t" + "extp %[temp3], $ac3, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + + "madd $ac2, %[load3], %[cospi_12_64] \n\t" + "msub $ac2, %[load4], %[cospi_20_64] \n\t" + "extp %[temp1], $ac2, 31 \n\t" + + "madd $ac1, %[load3], %[cospi_20_64] \n\t" + "madd $ac1, %[load4], %[cospi_12_64] \n\t" + "extp %[temp2], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + + "sub %[load1], %[temp3], %[temp2] \n\t" + "sub %[load1], %[load1], %[temp0] \n\t" + "add %[load1], %[load1], %[temp1] \n\t" + + "sub %[load2], %[temp0], %[temp1] \n\t" + "sub %[load2], %[load2], %[temp2] \n\t" + "add %[load2], %[load2], %[temp3] \n\t" + + "madd $ac1, %[load1], %[cospi_16_64] \n\t" + "madd $ac3, %[load2], %[cospi_16_64] \n\t" + + "extp %[step1_5], $ac1, 31 \n\t" + "extp %[step1_6], $ac3, 31 \n\t" + "add %[step1_4], %[temp0], %[temp1] \n\t" + "add %[step1_7], %[temp3], %[temp2] \n\t" + + : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), + [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), + [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), + [step1_7] "=r"(step1_7) + : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), + [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_16_64] "r"(cospi_16_64)); + + step2_0 = step1_0 + step1_7; + step2_1 = step1_1 + step1_6; + step2_2 = step1_2 + step1_5; + step2_3 = step1_3 + step1_4; + step2_4 = step1_3 - step1_4; + step2_5 = step1_2 - step1_5; + step2_6 = step1_1 - step1_6; + step2_7 = step1_0 - step1_7; + + step1_0 = step2_0 + step3_15; + step1_1 = step2_1 + step3_14; + step1_2 = step2_2 + step3_13; + step1_3 = step2_3 + step3_12; + step1_4 = step2_4 + step3_11; + step1_5 = step2_5 + step3_10; + step1_6 = step2_6 + step3_9; + step1_7 = step2_7 + step3_8; + step1_8 = step2_7 - step3_8; + step1_9 = step2_6 - step3_9; + step1_10 = step2_5 - step3_10; + step1_11 = step2_4 - step3_11; + step1_12 = step2_3 - step3_12; + step1_13 = step2_2 - step3_13; + step1_14 = step2_1 - step3_14; + step1_15 = step2_0 - step3_15; + + __asm__ __volatile__( + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_20], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_20 + step2_27) * cospi_16_64; + step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_21], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) + : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), + [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_21 + step2_26) * cospi_16_64; + step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_22], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) + : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), + [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_22 + step2_25) * cospi_16_64; + step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + __asm__ __volatile__( + "sub %[temp0], %[step2_24], %[step2_23] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "extp %[step1_23], $ac0, 31 \n\t" + + : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) + : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), + [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + + temp21 = (step2_23 + step2_24) * cospi_16_64; + step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + + // final stage + output[0 * 32] = step1_0 + step2_31; + output[1 * 32] = step1_1 + step2_30; + output[2 * 32] = step1_2 + step2_29; + output[3 * 32] = step1_3 + step2_28; + output[4 * 32] = step1_4 + step1_27; + output[5 * 32] = step1_5 + step1_26; + output[6 * 32] = step1_6 + step1_25; + output[7 * 32] = step1_7 + step1_24; + output[8 * 32] = step1_8 + step1_23; + output[9 * 32] = step1_9 + step1_22; + output[10 * 32] = step1_10 + step1_21; + output[11 * 32] = step1_11 + step1_20; + output[12 * 32] = step1_12 + step2_19; + output[13 * 32] = step1_13 + step2_18; + output[14 * 32] = step1_14 + step2_17; + output[15 * 32] = step1_15 + step2_16; + output[16 * 32] = step1_15 - step2_16; + output[17 * 32] = step1_14 - step2_17; + output[18 * 32] = step1_13 - step2_18; + output[19 * 32] = step1_12 - step2_19; + output[20 * 32] = step1_11 - step1_20; + output[21 * 32] = step1_10 - step1_21; + output[22 * 32] = step1_9 - step1_22; + output[23 * 32] = step1_8 - step1_23; + output[24 * 32] = step1_7 - step1_24; + output[25 * 32] = step1_6 - step1_25; + output[26 * 32] = step1_5 - step1_26; + output[27 * 32] = step1_4 - step1_27; + output[28 * 32] = step1_3 - step2_28; + output[29 * 32] = step1_2 - step2_29; + output[30 * 32] = step1_1 - step2_30; + output[31 * 32] = step1_0 - step2_31; + + input += 32; + output += 1; + } +} + +void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 32); + + // Columns + aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride); +} + +void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + DECLARE_ALIGNED(32, int16_t, out[32 * 32]); + int16_t *outptr = out; + uint32_t i; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + idct32_rows_dspr2(input, outptr, 8); + + outptr += 8; + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + for (i = 0; i < 31; ++i) { + outptr += 32; + + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 8(%[outptr]) \n\t" + "sw $zero, 12(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 24(%[outptr]) \n\t" + "sw $zero, 28(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 40(%[outptr]) \n\t" + "sw $zero, 44(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + } + + // Columns + aom_idct32_cols_add_blk_dspr2(out, dest, stride); +} + +void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, + int stride) { + int r, out; + int32_t a1, absa1; + int32_t vector_a1; + int32_t t1, t2, t3, t4; + int32_t vector_1, vector_2, vector_3, vector_4; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 32 \n\t" + "sra %[a1], %[out], 6 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c new file mode 100644 index 000000000..e6d0367cd --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + + for (i = 4; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + + "add %[Temp1], %[step_1], %[step_2] \n\t" + "sh %[Temp1], 8(%[output]) \n\t" + + "sub %[Temp2], %[step_1], %[step_2] \n\t" + "sh %[Temp2], 16(%[output]) \n\t" + + "sub %[Temp3], %[step_0], %[step_3] \n\t" + "sh %[Temp3], 24(%[output]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); + + input += 4; + output += 1; + } +} + +void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t step_0, step_1, step_2, step_3; + int Temp0, Temp1, Temp2, Temp3; + const int const_2_power_13 = 8192; + int i; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 4; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[2]) * cospi_16_64; + step_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[2]) * cospi_16_64; + step_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 4(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "extp %[step_0], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "extp %[step_1], $ac1, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + /* + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + step_2 = dct_const_round_shift(temp1); + */ + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "extp %[step_2], $ac0, 31 \n\t" + + /* + temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + step_3 = dct_const_round_shift(temp2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[step_3], $ac1, 31 \n\t" + + /* + output[0] = step_0 + step_3; + output[4] = step_1 + step_2; + output[8] = step_1 - step_2; + output[12] = step_0 - step_3; + */ + "add %[Temp0], %[step_0], %[step_3] \n\t" + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "addi %[Temp0], %[Temp0], 8 \n\t" + "sra %[Temp0], %[Temp0], 4 \n\t" + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), + [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), + [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); + + input += 4; + } +} + +void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[4 * 4]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + : + : [pos] "r"(pos)); + + // Rows + aom_idct4_rows_dspr2(input, outptr); + + // Columns + aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + int a1, absa1; + int r; + int32_t out; + int t2, vector_a1, vector_a; + uint32_t pos = 45; + int16_t input_dc = input[0]; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); + __asm__ __volatile__( + "addi %[out], %[out], 8 \n\t" + "sra %[a1], %[out], 4 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 0(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst4_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c new file mode 100644 index 000000000..0a20f76f2 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c @@ -0,0 +1,645 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/inv_txfm_dspr2.h" +#include "aom_dsp/txfm_common.h" + +#if HAVE_DSPR2 +void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + const int const_2_power_13 = 8192; + int Temp0, Temp1, Temp2, Temp3, Temp4; + int i; + + for (i = no_rows; i--;) { + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[Temp4], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[Temp4], %[Temp1] \n\t" + "sub %[step1_3], %[Temp4], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "sh %[Temp0], 0(%[output]) \n\t" + "add %[Temp1], %[step1_1], %[step1_6] \n\t" + "sh %[Temp1], 16(%[output]) \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "sh %[Temp0], 32(%[output]) \n\t" + "add %[Temp1], %[step1_3], %[step1_4] \n\t" + "sh %[Temp1], 48(%[output]) \n\t" + + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "sh %[Temp0], 64(%[output]) \n\t" + "sub %[Temp1], %[step1_2], %[step1_5] \n\t" + "sh %[Temp1], 80(%[output]) \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "sh %[Temp0], 96(%[output]) \n\t" + "sub %[Temp1], %[step1_0], %[step1_7] \n\t" + "sh %[Temp1], 112(%[output]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), + [input] "r"(input)); + + input += 8; + output += 1; + } +} + +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, + int dest_stride) { + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; + int Temp0, Temp1, Temp2, Temp3; + int i; + const int const_2_power_13 = 8192; + uint8_t *dest_pix; + uint8_t *cm = aom_ff_cropTbl; + + /* prefetch aom_ff_cropTbl */ + prefetch_load(aom_ff_cropTbl); + prefetch_load(aom_ff_cropTbl + 32); + prefetch_load(aom_ff_cropTbl + 64); + prefetch_load(aom_ff_cropTbl + 96); + prefetch_load(aom_ff_cropTbl + 128); + prefetch_load(aom_ff_cropTbl + 160); + prefetch_load(aom_ff_cropTbl + 192); + prefetch_load(aom_ff_cropTbl + 224); + + for (i = 0; i < 8; ++i) { + dest_pix = (dest + i); + + __asm__ __volatile__( + /* + temp_1 = (input[0] + input[4]) * cospi_16_64; + step2_0 = dct_const_round_shift(temp_1); + + temp_2 = (input[0] - input[4]) * cospi_16_64; + step2_1 = dct_const_round_shift(temp_2); + */ + "lh %[Temp0], 0(%[input]) \n\t" + "lh %[Temp1], 8(%[input]) \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "add %[Temp2], %[Temp0], %[Temp1] \n\t" + "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" + "extp %[step1_6], $ac0, 31 \n\t" + + "sub %[Temp3], %[Temp0], %[Temp1] \n\t" + "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "extp %[Temp2], $ac1, 31 \n\t" + + /* + temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; + step2_2 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 4(%[input]) \n\t" + "lh %[Temp1], 12(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "extp %[Temp3], $ac0, 31 \n\t" + + /* + step1_1 = step2_1 + step2_2; + step1_2 = step2_1 - step2_2; + */ + "add %[step1_1], %[Temp2], %[Temp3] \n\t" + "sub %[step1_2], %[Temp2], %[Temp3] \n\t" + + /* + temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; + step2_3 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" + "extp %[Temp1], $ac1, 31 \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + + /* + step1_0 = step2_0 + step2_3; + step1_3 = step2_0 - step2_3; + */ + "add %[step1_0], %[step1_6], %[Temp1] \n\t" + "sub %[step1_3], %[step1_6], %[Temp1] \n\t" + + /* + temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + step1_4 = dct_const_round_shift(temp_1); + */ + "lh %[Temp0], 2(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp1], 14(%[input]) \n\t" + "lh %[Temp0], 2(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" + "extp %[step1_4], $ac0, 31 \n\t" + + /* + temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1_7 = dct_const_round_shift(temp_2); + */ + "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" + "extp %[step1_7], $ac1, 31 \n\t" + + /* + temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + step1_5 = dct_const_round_shift(temp_1); + */ + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" + "extp %[step1_5], $ac0, 31 \n\t" + + /* + temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1_6 = dct_const_round_shift(temp_2); + */ + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "lh %[Temp0], 10(%[input]) \n\t" + "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" + "lh %[Temp1], 6(%[input]) \n\t" + "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* + temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; + temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; + */ + "sub %[Temp0], %[step1_7], %[step1_6] \n\t" + "sub %[Temp0], %[Temp0], %[step1_4] \n\t" + "add %[Temp0], %[Temp0], %[step1_5] \n\t" + "sub %[Temp1], %[step1_4], %[step1_5] \n\t" + "sub %[Temp1], %[Temp1], %[step1_6] \n\t" + "add %[Temp1], %[Temp1], %[step1_7] \n\t" + + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + + "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" + + /* + step1_4 = step1_4 + step1_5; + step1_7 = step1_6 + step1_7; + */ + "add %[step1_4], %[step1_4], %[step1_5] \n\t" + "add %[step1_7], %[step1_7], %[step1_6] \n\t" + + "extp %[step1_5], $ac0, 31 \n\t" + "extp %[step1_6], $ac1, 31 \n\t" + + /* add block */ + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "add %[Temp0], %[step1_0], %[step1_7] \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "add %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_3], %[step1_4] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_2], %[step1_5] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_1], %[step1_6] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "sub %[Temp0], %[step1_0], %[step1_7] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + + "lbu %[Temp1], 0(%[dest_pix]) \n\t" + "addi %[Temp0], %[Temp0], 16 \n\t" + "sra %[Temp0], %[Temp0], 5 \n\t" + "add %[Temp1], %[Temp1], %[Temp0] \n\t" + "lbux %[Temp2], %[Temp1](%[cm]) \n\t" + "sb %[Temp2], 0(%[dest_pix]) \n\t" + + : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), + [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), + [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), + [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), + [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), + [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) + : [const_2_power_13] "r"(const_2_power_13), + [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), + [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), + [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), + [dest_stride] "r"(dest_stride)); + + input += 8; + } +} + +void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 8); + + // Then transform columns and add to dest + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + DECLARE_ALIGNED(32, int16_t, out[8 * 8]); + int16_t *outptr = out; + uint32_t pos = 45; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); + + // First transform rows + idct8_rows_dspr2(input, outptr, 4); + + outptr += 4; + + __asm__ __volatile__( + "sw $zero, 0(%[outptr]) \n\t" + "sw $zero, 4(%[outptr]) \n\t" + "sw $zero, 16(%[outptr]) \n\t" + "sw $zero, 20(%[outptr]) \n\t" + "sw $zero, 32(%[outptr]) \n\t" + "sw $zero, 36(%[outptr]) \n\t" + "sw $zero, 48(%[outptr]) \n\t" + "sw $zero, 52(%[outptr]) \n\t" + "sw $zero, 64(%[outptr]) \n\t" + "sw $zero, 68(%[outptr]) \n\t" + "sw $zero, 80(%[outptr]) \n\t" + "sw $zero, 84(%[outptr]) \n\t" + "sw $zero, 96(%[outptr]) \n\t" + "sw $zero, 100(%[outptr]) \n\t" + "sw $zero, 112(%[outptr]) \n\t" + "sw $zero, 116(%[outptr]) \n\t" + + : + : [outptr] "r"(outptr)); + + // Then transform columns and add to dest + idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); +} + +void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, + int dest_stride) { + uint32_t pos = 45; + int32_t out; + int32_t r; + int32_t a1, absa1; + int32_t t1, t2, vector_a1, vector_1, vector_2; + + /* bit positon for extract from acc */ + __asm__ __volatile__("wrdsp %[pos], 1 \n\t" + + : + : [pos] "r"(pos)); + + out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); + __asm__ __volatile__( + "addi %[out], %[out], 16 \n\t" + "sra %[a1], %[out], 5 \n\t" + + : [out] "+r"(out), [a1] "=r"(a1) + :); + + if (a1 < 0) { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__( + "abs %[absa1], %[a1] \n\t" + "replv.qb %[vector_a1], %[absa1] \n\t" + + : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } else { + /* use quad-byte + * input and output memory are four byte aligned */ + __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" + + : [vector_a1] "=r"(vector_a1) + : [a1] "r"(a1)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[dest_stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + } + } +} + +void iadst8_dspr2(const int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + int x0, x1, x2, x3, x4, x5, x6, x7; + + x0 = input[7]; + x1 = input[0]; + x2 = input[5]; + x3 = input[2]; + x4 = input[3]; + x5 = input[4]; + x6 = input[1]; + x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = + output[6] = output[7] = 0; + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); + x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); + x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); + x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); + x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); + x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); + x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); + x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} +#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c new file mode 100644 index 000000000..fc0c32ce3 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c @@ -0,0 +1,1487 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_ports/mem.h" +#include "aom_dsp/mips/loopfilter_msa.h" + +int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 96); + + LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7); + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + src -= 3 * pitch; + ST_UB4(p2, p1, p0, q0, src, pitch); + src += (4 * pitch); + ST_UB2(q1, q2, src, pitch); + } else { + src -= 7 * pitch; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += pitch; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += pitch; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4); + + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += pitch; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += pitch; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += pitch; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += pitch; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += pitch; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += pitch; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + } +} + +void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); + uint8_t early_exit = 0; + + (void)count; + + early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); + + if (0 == early_exit) { + aom_hz_lpf_t16_16w(src, pitch, filter48); + } +} + +static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, int32_t count) { + if (1 == count) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + uint64_t dword0, dword1; + v16u8 flat2, mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 p0_filter16, p1_filter16; + v8i16 p2_filter8, p1_filter8, p0_filter8; + v8i16 q0_filter8, q1_filter8, q2_filter8; + v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + v8u16 tmp0, tmp1, tmp2; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch); + } else { + /* convert 8 bit input data into 16 bit */ + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, + zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, + q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + /* load 16 vector elements */ + LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4); + LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch); + SD(q1_d, src + pitch); + SD(q2_d, src + 2 * pitch); + } else { + /* LSB(right) 8 pixel operation */ + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5, + zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r, + q7_r); + + tmp0 = p7_r << 3; + tmp0 -= p7_r; + tmp0 += p6_r; + tmp0 += q0_r; + + src -= 7 * pitch; + + /* calculation of p6 and p5 */ + tmp1 = p6_r + p5_r + p4_r + p3_r; + tmp1 += (p2_r + p1_r + p0_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp0 = p5_r - p6_r + q1_r - p7_r; + tmp1 += tmp0; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p4 and p3 */ + tmp0 = p4_r - p5_r + q2_r - p7_r; + tmp2 = p3_r - p4_r + q3_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p2 and p1 */ + tmp0 = p2_r - p3_r + q4_r - p7_r; + tmp2 = p1_r - p2_r + q5_r - p7_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of p0 and q0 */ + tmp0 = (p0_r - p1_r) + (q6_r - p7_r); + tmp2 = (q7_r - p0_r) + (q0_r - p7_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q1 and q2 */ + tmp0 = q7_r - q0_r + q1_r - p6_r; + tmp2 = q7_r - q1_r + q2_r - p5_r; + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q3 and q4 */ + tmp0 = (q7_r - q2_r) + (q3_r - p4_r); + tmp2 = (q7_r - q3_r) + (q4_r - p3_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + src += pitch; + + /* calculation of q5 and q6 */ + tmp0 = (q7_r - q4_r) + (q5_r - p2_r); + tmp2 = (q7_r - q5_r) + (q6_r - p1_r); + tmp1 += tmp0; + p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + tmp1 += tmp2; + p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4); + PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16, + p1_filter16); + p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2); + p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2); + dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0); + dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0); + SD(dword0, src); + src += pitch; + SD(dword1, src); + } + } + } else { + aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, + thresh_ptr, count); + } +} + +void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); +} + +void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2); +} + +static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org; + v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, + p1_org, p0_org); + /* 8x8 transpose */ + TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, + p0_org, p7, p6, p5, p4, p3, p2, p1, p0); + /* 8x8 transpose */ + ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org, + tmp0, tmp1, tmp2, tmp3); + ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6); + ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7); + ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4); + ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6); + SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, + uint8_t *output, int32_t out_pitch) { + v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + + LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7); + TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, + q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o); + ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch); +} + +static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, + int32_t out_pitch) { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp2, tmp3; + + LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7); + input += (8 * in_pitch); + LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p7, p6, + p5, p4, p3, p2, p1, p0); + + /* transpose 16x8 matrix into 8x16 */ + /* total 8 intermediate register and 32 instructions */ + q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0); + q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1); + q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2); + q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3); + q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4); + q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5); + q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6); + q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7); + + ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1); + tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7); + tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5); + + ILVEV_B2_UB(q3, q2, q1, q0, q5, q7); + tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3); + tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1); + + ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3); + q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0); + tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5); + q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3); + q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4); + tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6); + q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2); + q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2); + + ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch); + output += (8 * out_pitch); + ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); +} + +int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org); + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + /* convert 16 bit output data into 8 bit */ + p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r); + p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r); + p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r); + q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r); + q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r); + q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16i8 zero = { 0 }; + v16u8 filter8, flat, flat2; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 tmp0_r, tmp1_r; + v8i16 r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST8x1_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST8x1_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST8x1_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST8x1_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST8x1_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST8x1_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST8x1_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST8x1_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST8x1_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); + + early_exit = + aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); + } + } +} + +int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16i8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + + /* load vector elements */ + LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src_org -= 2; + ST4x8_UB(vec2, vec3, src_org, pitch); + src_org += 8 * pitch; + ST4x8_UB(vec4, vec5, src_org, pitch); + + return 1; + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16); + filter48 += (4 * 16); + ST_UB2(q1_out, q2_out, filter48, 16); + filter48 += (2 * 16); + ST_UB(flat, filter48); + + return 0; + } +} + +int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { + v16u8 flat, flat2, filter8; + v16i8 zero = { 0 }; + v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in; + v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in; + v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in; + v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in; + v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l; + v8i16 l_out, r_out; + + flat = LD_UB(filter48 + 6 * 16); + + LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0); + LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7); + + AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2); + + if (__msa_test_bz_v(flat2)) { + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + LD_UB4(filter48, 16, p2, p1, p0, q0); + LD_UB2(filter48 + 4 * 16, 16, q1, q2); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src_org -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec2, 4, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 0, (src_org + 4), pitch); + src_org += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch); + ST2x4_UB(vec5, 4, (src_org + 4), pitch); + + return 1; + } else { + src -= 7 * 16; + + ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero, + p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, + p2_r_in, p1_r_in, p0_r_in); + q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0); + + tmp0_r = p7_r_in << 3; + tmp0_r -= p7_r_in; + tmp0_r += p6_r_in; + tmp0_r += q0_r_in; + tmp1_r = p6_r_in + p5_r_in; + tmp1_r += p4_r_in; + tmp1_r += p3_r_in; + tmp1_r += p2_r_in; + tmp1_r += p1_r_in; + tmp1_r += p0_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + + ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in, + p5_l_in, p4_l_in); + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in, + p1_l_in, p0_l_in); + q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0); + + tmp0_l = p7_l_in << 3; + tmp0_l -= p7_l_in; + tmp0_l += p6_l_in; + tmp0_l += q0_l_in; + tmp1_l = p6_l_in + p5_l_in; + tmp1_l += p4_l_in; + tmp1_l += p3_l_in; + tmp1_l += p2_l_in; + tmp1_l += p1_l_in; + tmp1_l += p0_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2); + ST_UB(p6, src); + src += 16; + + /* p5 */ + q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1); + tmp0_r = p5_r_in - p6_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1); + tmp0_l = p5_l_in - p6_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2); + ST_UB(p5, src); + src += 16; + + /* p4 */ + q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2); + tmp0_r = p4_r_in - p5_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2); + tmp0_l = p4_l_in - p5_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2); + ST_UB(p4, src); + src += 16; + + /* p3 */ + q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3); + tmp0_r = p3_r_in - p4_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3); + tmp0_l = p3_l_in - p4_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2); + ST_UB(p3, src); + src += 16; + + /* p2 */ + q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4); + filter8 = LD_UB(filter48); + tmp0_r = p2_r_in - p3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4); + tmp0_l = p2_l_in - p3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p1 */ + q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5); + filter8 = LD_UB(filter48 + 16); + tmp0_r = p1_r_in - p2_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5); + tmp0_l = p1_l_in - p2_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)(tmp1_l), 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* p0 */ + q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6); + filter8 = LD_UB(filter48 + 32); + tmp0_r = p0_r_in - p1_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6); + tmp0_l = p0_l_in - p1_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q0 */ + q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7); + filter8 = LD_UB(filter48 + 48); + tmp0_r = q7_r_in - p0_r_in; + tmp0_r += q0_r_in; + tmp0_r -= p7_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7); + tmp0_l = q7_l_in - p0_l_in; + tmp0_l += q0_l_in; + tmp0_l -= p7_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q1 */ + filter8 = LD_UB(filter48 + 64); + tmp0_r = q7_r_in - q0_r_in; + tmp0_r += q1_r_in; + tmp0_r -= p6_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q0_l_in; + tmp0_l += q1_l_in; + tmp0_l -= p6_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q2 */ + filter8 = LD_UB(filter48 + 80); + tmp0_r = q7_r_in - q1_r_in; + tmp0_r += q2_r_in; + tmp0_r -= p5_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q1_l_in; + tmp0_l += q2_l_in; + tmp0_l -= p5_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2); + ST_UB(filter8, src); + src += 16; + + /* q3 */ + tmp0_r = q7_r_in - q2_r_in; + tmp0_r += q3_r_in; + tmp0_r -= p4_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q2_l_in; + tmp0_l += q3_l_in; + tmp0_l -= p4_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2); + ST_UB(q3, src); + src += 16; + + /* q4 */ + tmp0_r = q7_r_in - q3_r_in; + tmp0_r += q4_r_in; + tmp0_r -= p3_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q3_l_in; + tmp0_l += q4_l_in; + tmp0_l -= p3_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2); + ST_UB(q4, src); + src += 16; + + /* q5 */ + tmp0_r = q7_r_in - q4_r_in; + tmp0_r += q5_r_in; + tmp0_r -= p2_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q4_l_in; + tmp0_l += q5_l_in; + tmp0_l -= p2_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2); + ST_UB(q5, src); + src += 16; + + /* q6 */ + tmp0_r = q7_r_in - q5_r_in; + tmp0_r += q6_r_in; + tmp0_r -= p1_r_in; + tmp1_r += tmp0_r; + r_out = __msa_srari_h((v8i16)tmp1_r, 4); + tmp0_l = q7_l_in - q5_l_in; + tmp0_l += q6_l_in; + tmp0_l -= p1_l_in; + tmp1_l += tmp0_l; + l_out = __msa_srari_h((v8i16)tmp1_l, 4); + r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out); + q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2); + ST_UB(q6, src); + + return 0; + } +} + +void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint8_t early_exit = 0; + DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]); + uint8_t *filter48 = &transposed_input[16 * 16]; + + transpose_16x16((src - 8), pitch, &transposed_input[0], 16); + + early_exit = + aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); + + if (0 == early_exit) { + early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, + &filter48[0]); + + if (0 == early_exit) { + transpose_16x16(transposed_input, 16, (src - 8), pitch); + } + } +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c new file mode 100644 index 000000000..dc0a97764 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p1_d, p0_d, q0_d, q1_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); +} + +void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + + ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch); +} + +void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 mask, hev, flat, limit, thresh, b_limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v8i16 vec0, vec1, vec2, vec3; + + LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); +} + +void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0_ptr, + const uint8_t *limit0_ptr, + const uint8_t *thresh0_ptr, + const uint8_t *b_limit1_ptr, + const uint8_t *limit1_ptr, + const uint8_t *thresh1_ptr) { + v16u8 mask, hev, flat; + v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + + LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7); + LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13, + row14, row15); + + TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8, + row9, row10, row11, row12, row13, row14, row15, p3, p2, + p1, p0, q0, q1, q2, q3); + + thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr); + thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr); + thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0); + + b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr); + b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr); + b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0); + + limit0 = (v16u8)__msa_fill_b(*limit0_ptr); + limit1 = (v16u8)__msa_fill_b(*limit1_ptr); + limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev, + mask, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3); + ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1); + ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5); + + src -= 2; + + ST4x8_UB(tmp2, tmp3, src, pitch); + src += (8 * pitch); + ST4x8_UB(tmp4, tmp5, src, pitch); +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c new file mode 100644 index 000000000..dc203e79c --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/mips/loopfilter_msa.h" + +void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d; + v16u8 mask, hev, flat, thresh, b_limit, limit; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8; + v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r; + v16i8 zero = { 0 }; + + /* load vector elements */ + LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8, + p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero, + q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8); + PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat); + + p2_d = __msa_copy_u_d((v2i64)p2_out, 0); + p1_d = __msa_copy_u_d((v2i64)p1_out, 0); + p0_d = __msa_copy_u_d((v2i64)p0_out, 0); + q0_d = __msa_copy_u_d((v2i64)q0_out, 0); + q1_d = __msa_copy_u_d((v2i64)q1_out, 0); + q2_d = __msa_copy_u_d((v2i64)q2_out, 0); + + src -= 3 * pitch; + + SD4(p2_d, p1_d, p0_d, q0_d, src, pitch); + src += (4 * pitch); + SD(q1_d, src); + src += pitch; + SD(q2_d, src); + } +} + +void aom_lpf_horizontal_8_dual_msa( + uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; + v16u8 flat, mask, hev, tmp, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + + /* load vector elements */ + LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + tmp = (v16u8)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + tmp = (v16u8)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + tmp = (v16u8)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + src -= 3 * pitch; + + ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch); + src += (4 * pitch); + ST_UB2(q1_out, q2_out, src, pitch); + src += (2 * pitch); + } +} + +void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4; + + /* load vector elements */ + LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3); + + TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = (v16u8)__msa_fill_b(*thresh_ptr); + b_limit = (v16u8)__msa_fill_b(*b_limit_ptr); + limit = (v16u8)__msa_fill_b(*limit_ptr); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); + + if (__msa_test_bz_v(flat)) { + /* Store 4 pixels p1-_q1 */ + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + + src -= 2; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + src += 4 * pitch; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r, + p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + /* Store 6 pixels p2-_q2 */ + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1); + + src -= 3; + ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec4, 4, src + 4, pitch); + } +} + +void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit0, const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *b_limit1, const uint8_t *limit1, + const uint8_t *thresh1) { + uint8_t *temp_src; + v16u8 p3, p2, p1, p0, q3, q2, q1, q0; + v16u8 p1_out, p0_out, q0_out, q1_out; + v16u8 flat, mask, hev, thresh, b_limit, limit; + v16u8 row4, row5, row6, row7, row12, row13, row14, row15; + v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r; + v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l; + v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r; + v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l; + v16u8 zero = { 0 }; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + + temp_src = src - 4; + + LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7); + temp_src += (8 * pitch); + LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15); + + /* transpose 16x8 matrix into 8x16 */ + TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0, + row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2, + q3); + + thresh = (v16u8)__msa_fill_b(*thresh0); + vec0 = (v8i16)__msa_fill_b(*thresh1); + thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh); + + b_limit = (v16u8)__msa_fill_b(*b_limit0); + vec0 = (v8i16)__msa_fill_b(*b_limit1); + b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit); + + limit = (v16u8)__msa_fill_b(*limit0); + vec0 = (v8i16)__msa_fill_b(*limit1); + limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit); + + /* mask and hev */ + LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, + mask, flat); + /* flat4 */ + AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat); + /* filter4 */ + AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + + if (__msa_test_bz_v(flat)) { + ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec2, vec3); + ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec4, vec5); + + src -= 2; + ST4x8_UB(vec2, vec3, src, pitch); + src += 8 * pitch; + ST4x8_UB(vec4, vec5, src, pitch); + } else { + ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero, + q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r); + AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r, + p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r); + + ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l); + ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l); + + /* filter8 */ + AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l, + p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l); + + /* convert 16 bit output data into 8 bit */ + PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l, + p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r, + p0_filt8_r, q0_filt8_r); + PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r, + q2_filt8_r); + + /* store pixel values */ + p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat); + p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat); + p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat); + q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat); + q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat); + q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat); + + ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec3, vec4); + ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1); + ILVRL_H2_SH(vec1, vec0, vec6, vec7); + ILVRL_B2_SH(q2, q1, vec2, vec5); + + src -= 3; + ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec2, 4, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 0, src + 4, pitch); + src += (4 * pitch); + ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch); + ST2x4_UB(vec5, 4, src + 4, pitch); + } +} diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c new file mode 100644 index 000000000..883d0523d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c @@ -0,0 +1,327 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask; + uint32_t hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + /* loop filter designed to work using chars so that we can make maximum use + of 8 bit simd instructions. */ + for (i = 0; i < 2; i++) { + sm1 = s - (pitch << 2); + s0 = sm1 + pitch; + s1 = s0 + pitch; + s2 = s - pitch; + s3 = s; + s4 = s + pitch; + s5 = s4 + pitch; + s6 = s5 + pitch; + + __asm__ __volatile__( + "lw %[p1], (%[s1]) \n\t" + "lw %[p2], (%[s2]) \n\t" + "lw %[p3], (%[s3]) \n\t" + "lw %[p4], (%[s4]) \n\t" + + : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + mask will be zero and filtering is not needed */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + __asm__ __volatile__( + "lw %[pm1], (%[sm1]) \n\t" + "lw %[p0], (%[s0]) \n\t" + "lw %[p5], (%[s5]) \n\t" + "lw %[p6], (%[s6]) \n\t" + + : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6) + : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)); + + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + __asm__ __volatile__( + "sw %[p1], (%[s1]) \n\t" + "sw %[p2], (%[s2]) \n\t" + "sw %[p3], (%[s3]) \n\t" + "sw %[p4], (%[s4]) \n\t" + + : + : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4), + [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + } + } + + s = s + 4; + } +} + +void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev; + uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + /* load quad-byte vectors + * memory is 4 byte aligned + */ + p2 = *((uint32_t *)(s1 - 4)); + p6 = *((uint32_t *)(s1)); + p1 = *((uint32_t *)(s2 - 4)); + p5 = *((uint32_t *)(s2)); + p0 = *((uint32_t *)(s3 - 4)); + p4 = *((uint32_t *)(s3)); + pm1 = *((uint32_t *)(s4 - 4)); + p3 = *((uint32_t *)(s4)); + + /* transpose pm1, p0, p1, p2 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" + "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[pm1], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), + [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p3, p4, p5, p6 */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p4], %[p3] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* if (p1 - p4 == 0) and (p2 - p3 == 0) + * mask will be zero and filtering is not needed + */ + if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { + filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5, + p6, thresh_vec, &hev, &mask); + + /* if mask == 0 do filtering is not needed */ + if (mask) { + /* filtering */ + filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); + + /* unpack processed 4x4 neighborhood + * don't use transpose on output data + * because memory isn't aligned + */ + __asm__ __volatile__( + "sb %[p4], 1(%[s4]) \n\t" + "sb %[p3], 0(%[s4]) \n\t" + "sb %[p2], -1(%[s4]) \n\t" + "sb %[p1], -2(%[s4]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s4] "r"(s4)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s3]) \n\t" + "sb %[p3], 0(%[s3]) \n\t" + "sb %[p2], -1(%[s3]) \n\t" + "sb %[p1], -2(%[s3]) \n\t" + + : [p1] "+r"(p1) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s2]) \n\t" + "sb %[p3], 0(%[s2]) \n\t" + "sb %[p2], -1(%[s2]) \n\t" + "sb %[p1], -2(%[s2]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s2] "r"(s2)); + + __asm__ __volatile__( + "srl %[p4], %[p4], 8 \n\t" + "srl %[p3], %[p3], 8 \n\t" + "srl %[p2], %[p2], 8 \n\t" + "srl %[p1], %[p1], 8 \n\t" + + : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1) + :); + + __asm__ __volatile__( + "sb %[p4], 1(%[s1]) \n\t" + "sb %[p3], 0(%[s1]) \n\t" + "sb %[p2], -1(%[s1]) \n\t" + "sb %[p1], -2(%[s1]) \n\t" + + : + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [s1] "r"(s1)); + } + } + } +} + +void aom_lpf_horizontal_4_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_horizontal_8_dual_dspr2( + uint8_t *s, int p /* pitch */, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, + const uint8_t *limit1, const uint8_t *thresh1) { + aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0); + aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1); +} + +void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); + aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h new file mode 100644 index 000000000..72df09823 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -0,0 +1,735 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* inputs & outputs are quad-byte vectors */ +static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1, + uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (*ps0) ^ N128; + vps1 = (*ps1) ^ N128; + vqs0 = (*qs0) ^ N128; + vqs1 = (*qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* aom_filter &= hev; */ + "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" + + /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + + /* aom_filter &= mask; */ + "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" + + : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" + + /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *ps0 = vps0 ^ N128; + *ps1 = vps1 ^ N128; + *qs0 = vqs0 ^ N128; + *qs1 = vqs1 ^ N128; +} + +static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1, + uint32_t ps0, uint32_t qs0, uint32_t qs1, + uint32_t *p1_f0, uint32_t *p0_f0, + uint32_t *q0_f0, uint32_t *q1_f0) { + int32_t aom_filter_l, aom_filter_r; + int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r; + int32_t subr_r, subr_l; + uint32_t t1, t2, HWM, t3; + uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r; + int32_t vps1, vps0, vqs0, vqs1; + int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r; + uint32_t N128; + + N128 = 0x80808080; + t1 = 0x03000300; + t2 = 0x04000400; + t3 = 0x01000100; + HWM = 0xFF00FF00; + + vps0 = (ps0) ^ N128; + vps1 = (ps1) ^ N128; + vqs0 = (qs0) ^ N128; + vqs1 = (qs1) ^ N128; + + /* use halfword pairs instead quad-bytes because of accuracy */ + vps0_l = vps0 & HWM; + vps0_r = vps0 << 8; + vps0_r = vps0_r & HWM; + + vps1_l = vps1 & HWM; + vps1_r = vps1 << 8; + vps1_r = vps1_r & HWM; + + vqs0_l = vqs0 & HWM; + vqs0_r = vqs0 << 8; + vqs0_r = vqs0_r & HWM; + + vqs1_l = vqs1 & HWM; + vqs1_r = vqs1 << 8; + vqs1_r = vqs1_r & HWM; + + mask_l = mask & HWM; + mask_r = mask << 8; + mask_r = mask_r & HWM; + + hev_l = hev & HWM; + hev_r = hev << 8; + hev_r = hev_r & HWM; + + __asm__ __volatile__( + /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */ + "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t" + "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t" + + /* qs0 - ps0 */ + "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t" + "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t" + + /* aom_filter &= hev; */ + "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t" + + /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */ + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_l], %[hev_l], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + "xor %[invhev_r], %[hev_r], %[HWM] \n\t" + "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t" + "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t" + + /* aom_filter &= mask; */ + "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t" + "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t" + + : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r), + [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r), + [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r) + : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l), + [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r), + [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l), + [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r), + [HWM] "r"(HWM)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + __asm__ __volatile__( + /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */ + "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t" + "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t" + + /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */ + "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t" + "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t" + "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t" + "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t" + + "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t" + "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t" + + "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t" + "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t" + + /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */ + "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t" + "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t" + + /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */ + "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t" + + : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r), + [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r), + [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l), + [vqs0_r] "+r"(vqs0_r) + : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), + [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r)); + + __asm__ __volatile__( + /* (aom_filter += 1) >>= 1 */ + "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t" + "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t" + + /* aom_filter &= ~hev; */ + "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t" + "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t" + + /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */ + "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t" + "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t" + + /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */ + "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t" + "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t" + + : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r), + [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l), + [vqs1_r] "+r"(vqs1_r) + : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r)); + + /* Create quad-bytes from halfword pairs */ + vqs0_l = vqs0_l & HWM; + vqs1_l = vqs1_l & HWM; + vps0_l = vps0_l & HWM; + vps1_l = vps1_l & HWM; + + __asm__ __volatile__( + "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t" + "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t" + "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t" + "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t" + + : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r), + [vqs0_r] "+r"(vqs0_r) + :); + + vqs0 = vqs0_l | vqs0_r; + vqs1 = vqs1_l | vqs1_r; + vps0 = vps0_l | vps0_r; + vps1 = vps1_l | vps1_r; + + *p0_f0 = vps0 ^ N128; + *p1_f0 = vps1 ^ N128; + *q0_f0 = vqs0 ^ N128; + *q1_f0 = vqs1 ^ N128; +} + +static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1, + uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; +} + +static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, uint32_t *op2_f1, + uint32_t *op1_f1, uint32_t *op0_f1, + uint32_t *oq0_f1, uint32_t *oq1_f1, + uint32_t *oq2_f1) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + uint32_t res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2; + uint32_t tmp; + uint32_t add_p210_q012; + uint32_t u32Four = 0x00040004; + + /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */ + /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */ + /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */ + /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */ + /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */ + /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */ + + __asm__ __volatile__( + "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t" + "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t" + + "shll.ph %[tmp], %[p3], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p3] \n\t" + "addu.ph %[res_op1], %[p3], %[p3] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op1], %[res_op1], %[p1] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q1] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q2] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q2] \n\t" + "shrl.ph %[res_op1], %[res_op1], 3 \n\t" + "shrl.ph %[res_op2], %[res_op2], 3 \n\t" + "addu.ph %[res_op0], %[p3], %[p0] \n\t" + "addu.ph %[res_oq0], %[q0], %[q3] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t" + "addu.ph %[res_oq1], %[q3], %[q3] \n\t" + "shll.ph %[tmp], %[q3], 1 \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t" + "addu.ph %[res_oq2], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t" + "shrl.ph %[res_op0], %[res_op0], 3 \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t" + + : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0), + [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2), + [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four)); + + *op2_f1 = res_op2; + *op1_f1 = res_op1; + *op0_f1 = res_op0; + *oq0_f1 = res_oq0; + *oq1_f1 = res_oq1; + *oq2_f1 = res_oq2; +} + +static INLINE void wide_mbfilter_dspr2( + uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3, + uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1, + uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6, + uint32_t *oq7) { + const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4; + const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0; + uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6; + uint32_t tmp; + uint32_t add_p6toq6; + uint32_t u32Eight = 0x00080008; + + __asm__ __volatile__( + /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6 + which is used most of the time */ + "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t" + "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t" + + : [add_p6toq6] "=&r"(add_p6toq6) + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), + [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [u32Eight] "r"(u32Eight)); + + __asm__ __volatile__( + /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + + p3 + p2 + p1 + p0 + q0, 4) */ + "shll.ph %[tmp], %[p7], 3 \n\t" + "subu.ph %[res_op6], %[tmp], %[p7] \n\t" + "addu.ph %[res_op6], %[res_op6], %[p6] \n\t" + "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q1] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q2] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q3] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q4] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q5] \n\t" + "subu.ph %[res_op6], %[res_op6], %[q6] \n\t" + "shrl.ph %[res_op6], %[res_op6], 4 \n\t" + + /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + + p2 + p1 + p0 + q0 + q1, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op5], %[tmp], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p7] \n\t" + "addu.ph %[res_op5], %[res_op5], %[p5] \n\t" + "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q2] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q3] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q4] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q5] \n\t" + "subu.ph %[res_op5], %[res_op5], %[q6] \n\t" + "shrl.ph %[res_op5], %[res_op5], 4 \n\t" + + /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + + p1 + p0 + q0 + q1 + q2, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op4], %[tmp], %[p7] \n\t" + "addu.ph %[res_op4], %[res_op4], %[p4] \n\t" + "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q3] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q4] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q5] \n\t" + "subu.ph %[res_op4], %[res_op4], %[q6] \n\t" + "shrl.ph %[res_op4], %[res_op4], 4 \n\t" + + /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + + p1 + p0 + q0 + q1 + q2 + q3, 4) */ + "shll.ph %[tmp], %[p7], 2 \n\t" + "addu.ph %[res_op3], %[tmp], %[p3] \n\t" + "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q4] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q5] \n\t" + "subu.ph %[res_op3], %[res_op3], %[q6] \n\t" + "shrl.ph %[res_op3], %[res_op3], 4 \n\t" + + /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + + p0 + q0 + q1 + q2 + q3 + q4, 4) */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op2], %[tmp], %[p7] \n\t" + "addu.ph %[res_op2], %[res_op2], %[p2] \n\t" + "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q5] \n\t" + "subu.ph %[res_op2], %[res_op2], %[q6] \n\t" + "shrl.ph %[res_op2], %[res_op2], 4 \n\t" + + /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */ + "shll.ph %[tmp], %[p7], 1 \n\t" + "addu.ph %[res_op1], %[tmp], %[p1] \n\t" + "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t" + "subu.ph %[res_op1], %[res_op1], %[q6] \n\t" + "shrl.ph %[res_op1], %[res_op1], 4 \n\t" + + /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */ + "addu.ph %[res_op0], %[p7], %[p0] \n\t" + "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t" + "shrl.ph %[res_op0], %[res_op0], 4 \n\t" + + : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5), + [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3), + [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1), + [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp) + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1), + [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), + [add_p6toq6] "r"(add_p6toq6)); + + *op6 = res_op6; + *op5 = res_op5; + *op4 = res_op4; + *op3 = res_op3; + *op2 = res_op2; + *op1 = res_op1; + *op0 = res_op0; + + __asm__ __volatile__( + /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */ + "addu.ph %[res_oq0], %[q7], %[q0] \n\t" + "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t" + "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t" + + /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq1], %[tmp], %[q1] \n\t" + "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t" + "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t" + "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t" + + /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + + q3 + q4 + q5 + q6 + q7 * 3, 4) */ + "shll.ph %[tmp], %[q7], 1 \n\t" + "addu.ph %[res_oq2], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t" + "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t" + "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t" + "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t" + + /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 + + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq3], %[tmp], %[q3] \n\t" + "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t" + "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t" + "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t" + + /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 + + q4 * 2 + q5 + q6 + q7 * 5, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq4], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t" + "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t" + "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t" + "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t" + + /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 + + q5 * 2 + q6 + q7 * 6, 4) */ + "shll.ph %[tmp], %[q7], 2 \n\t" + "addu.ph %[res_oq5], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t" + "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t" + "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t" + "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t" + + /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + + q4 + q5 + q6 * 2 + q7 * 7, 4) */ + "shll.ph %[tmp], %[q7], 3 \n\t" + "subu.ph %[res_oq6], %[tmp], %[q7] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t" + "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t" + "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t" + "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t" + + : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5), + [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3), + [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1), + [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp) + : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2), + [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6), + [add_p6toq6] "r"(add_p6toq6)); + + *oq0 = res_oq0; + *oq1 = res_oq1; + *oq2 = res_oq2; + *oq3 = res_oq3; + *oq4 = res_oq4; + *oq5 = res_oq5; + *oq6 = res_oq6; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h new file mode 100644 index 000000000..3e6994714 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -0,0 +1,436 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +#define STORE_F0() \ + { \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s4]) \n\t" \ + "sb %[q0_f0], 0(%[s4]) \n\t" \ + "sb %[p0_f0], -1(%[s4]) \n\t" \ + "sb %[p1_f0], -2(%[s4]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s3]) \n\t" \ + "sb %[q0_f0], 0(%[s3]) \n\t" \ + "sb %[p0_f0], -1(%[s3]) \n\t" \ + "sb %[p1_f0], -2(%[s3]) \n\t" \ + \ + : [p1_f0] "+r"(p1_f0) \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \ + [p0_f0] "r"(p0_f0)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s2]) \n\t" \ + "sb %[q0_f0], 0(%[s2]) \n\t" \ + "sb %[p0_f0], -1(%[s2]) \n\t" \ + "sb %[p1_f0], -2(%[s2]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q1_f0], %[q1_f0], 8 \n\t" \ + "srl %[q0_f0], %[q0_f0], 8 \n\t" \ + "srl %[p0_f0], %[p0_f0], 8 \n\t" \ + "srl %[p1_f0], %[p1_f0], 8 \n\t" \ + \ + : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \ + [p1_f0] "+r"(p1_f0) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q1_f0], 1(%[s1]) \n\t" \ + "sb %[q0_f0], 0(%[s1]) \n\t" \ + "sb %[p0_f0], -1(%[s1]) \n\t" \ + "sb %[p1_f0], -2(%[s1]) \n\t" \ + \ + : \ + : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \ + [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \ + } + +#define STORE_F1() \ + { \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + \ + : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \ + [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + \ + : \ + : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \ + [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + \ + : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \ + [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + \ + : \ + : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \ + [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \ + } + +#define STORE_F2() \ + { \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s4]) \n\t" \ + "sb %[q5_r], 5(%[s4]) \n\t" \ + "sb %[q4_r], 4(%[s4]) \n\t" \ + "sb %[q3_r], 3(%[s4]) \n\t" \ + "sb %[q2_r], 2(%[s4]) \n\t" \ + "sb %[q1_r], 1(%[s4]) \n\t" \ + "sb %[q0_r], 0(%[s4]) \n\t" \ + "sb %[p0_r], -1(%[s4]) \n\t" \ + "sb %[p1_r], -2(%[s4]) \n\t" \ + "sb %[p2_r], -3(%[s4]) \n\t" \ + "sb %[p3_r], -4(%[s4]) \n\t" \ + "sb %[p4_r], -5(%[s4]) \n\t" \ + "sb %[p5_r], -6(%[s4]) \n\t" \ + "sb %[p6_r], -7(%[s4]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_r], %[q6_r], 16 \n\t" \ + "srl %[q5_r], %[q5_r], 16 \n\t" \ + "srl %[q4_r], %[q4_r], 16 \n\t" \ + "srl %[q3_r], %[q3_r], 16 \n\t" \ + "srl %[q2_r], %[q2_r], 16 \n\t" \ + "srl %[q1_r], %[q1_r], 16 \n\t" \ + "srl %[q0_r], %[q0_r], 16 \n\t" \ + "srl %[p0_r], %[p0_r], 16 \n\t" \ + "srl %[p1_r], %[p1_r], 16 \n\t" \ + "srl %[p2_r], %[p2_r], 16 \n\t" \ + "srl %[p3_r], %[p3_r], 16 \n\t" \ + "srl %[p4_r], %[p4_r], 16 \n\t" \ + "srl %[p5_r], %[p5_r], 16 \n\t" \ + "srl %[p6_r], %[p6_r], 16 \n\t" \ + \ + : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \ + [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \ + [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \ + [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \ + [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_r], 6(%[s3]) \n\t" \ + "sb %[q5_r], 5(%[s3]) \n\t" \ + "sb %[q4_r], 4(%[s3]) \n\t" \ + "sb %[q3_r], 3(%[s3]) \n\t" \ + "sb %[q2_r], 2(%[s3]) \n\t" \ + "sb %[q1_r], 1(%[s3]) \n\t" \ + "sb %[q0_r], 0(%[s3]) \n\t" \ + "sb %[p0_r], -1(%[s3]) \n\t" \ + "sb %[p1_r], -2(%[s3]) \n\t" \ + "sb %[p2_r], -3(%[s3]) \n\t" \ + "sb %[p3_r], -4(%[s3]) \n\t" \ + "sb %[p4_r], -5(%[s3]) \n\t" \ + "sb %[p5_r], -6(%[s3]) \n\t" \ + "sb %[p6_r], -7(%[s3]) \n\t" \ + \ + : \ + : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \ + [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \ + [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \ + [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \ + [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s2]) \n\t" \ + "sb %[q5_l], 5(%[s2]) \n\t" \ + "sb %[q4_l], 4(%[s2]) \n\t" \ + "sb %[q3_l], 3(%[s2]) \n\t" \ + "sb %[q2_l], 2(%[s2]) \n\t" \ + "sb %[q1_l], 1(%[s2]) \n\t" \ + "sb %[q0_l], 0(%[s2]) \n\t" \ + "sb %[p0_l], -1(%[s2]) \n\t" \ + "sb %[p1_l], -2(%[s2]) \n\t" \ + "sb %[p2_l], -3(%[s2]) \n\t" \ + "sb %[p3_l], -4(%[s2]) \n\t" \ + "sb %[p4_l], -5(%[s2]) \n\t" \ + "sb %[p5_l], -6(%[s2]) \n\t" \ + "sb %[p6_l], -7(%[s2]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \ + \ + __asm__ __volatile__( \ + "srl %[q6_l], %[q6_l], 16 \n\t" \ + "srl %[q5_l], %[q5_l], 16 \n\t" \ + "srl %[q4_l], %[q4_l], 16 \n\t" \ + "srl %[q3_l], %[q3_l], 16 \n\t" \ + "srl %[q2_l], %[q2_l], 16 \n\t" \ + "srl %[q1_l], %[q1_l], 16 \n\t" \ + "srl %[q0_l], %[q0_l], 16 \n\t" \ + "srl %[p0_l], %[p0_l], 16 \n\t" \ + "srl %[p1_l], %[p1_l], 16 \n\t" \ + "srl %[p2_l], %[p2_l], 16 \n\t" \ + "srl %[p3_l], %[p3_l], 16 \n\t" \ + "srl %[p4_l], %[p4_l], 16 \n\t" \ + "srl %[p5_l], %[p5_l], 16 \n\t" \ + "srl %[p6_l], %[p6_l], 16 \n\t" \ + \ + : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \ + [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \ + [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \ + [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \ + [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \ + :); \ + \ + __asm__ __volatile__( \ + "sb %[q6_l], 6(%[s1]) \n\t" \ + "sb %[q5_l], 5(%[s1]) \n\t" \ + "sb %[q4_l], 4(%[s1]) \n\t" \ + "sb %[q3_l], 3(%[s1]) \n\t" \ + "sb %[q2_l], 2(%[s1]) \n\t" \ + "sb %[q1_l], 1(%[s1]) \n\t" \ + "sb %[q0_l], 0(%[s1]) \n\t" \ + "sb %[p0_l], -1(%[s1]) \n\t" \ + "sb %[p1_l], -2(%[s1]) \n\t" \ + "sb %[p2_l], -3(%[s1]) \n\t" \ + "sb %[p3_l], -4(%[s1]) \n\t" \ + "sb %[p4_l], -5(%[s1]) \n\t" \ + "sb %[p5_l], -6(%[s1]) \n\t" \ + "sb %[p6_l], -7(%[s1]) \n\t" \ + \ + : \ + : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \ + [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \ + [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \ + [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \ + [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \ + } + +#define PACK_LEFT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p3_l], %[p3] \n\t" \ + "preceu.ph.qbl %[p2_l], %[p2] \n\t" \ + "preceu.ph.qbl %[p1_l], %[p1] \n\t" \ + "preceu.ph.qbl %[p0_l], %[p0] \n\t" \ + "preceu.ph.qbl %[q0_l], %[q0] \n\t" \ + "preceu.ph.qbl %[q1_l], %[q1] \n\t" \ + "preceu.ph.qbl %[q2_l], %[q2] \n\t" \ + "preceu.ph.qbl %[q3_l], %[q3] \n\t" \ + \ + : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \ + [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \ + [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_LEFT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbl %[p7_l], %[p7] \n\t" \ + "preceu.ph.qbl %[p6_l], %[p6] \n\t" \ + "preceu.ph.qbl %[p5_l], %[p5] \n\t" \ + "preceu.ph.qbl %[p4_l], %[p4] \n\t" \ + "preceu.ph.qbl %[q4_l], %[q4] \n\t" \ + "preceu.ph.qbl %[q5_l], %[q5] \n\t" \ + "preceu.ph.qbl %[q6_l], %[q6] \n\t" \ + "preceu.ph.qbl %[q7_l], %[q7] \n\t" \ + \ + : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \ + [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \ + [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define PACK_RIGHT_0TO3() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p3_r], %[p3] \n\t" \ + "preceu.ph.qbr %[p2_r], %[p2] \n\t" \ + "preceu.ph.qbr %[p1_r], %[p1] \n\t" \ + "preceu.ph.qbr %[p0_r], %[p0] \n\t" \ + "preceu.ph.qbr %[q0_r], %[q0] \n\t" \ + "preceu.ph.qbr %[q1_r], %[q1] \n\t" \ + "preceu.ph.qbr %[q2_r], %[q2] \n\t" \ + "preceu.ph.qbr %[q3_r], %[q3] \n\t" \ + \ + : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \ + [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \ + [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \ + : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \ + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \ + } + +#define PACK_RIGHT_4TO7() \ + { \ + __asm__ __volatile__( \ + "preceu.ph.qbr %[p7_r], %[p7] \n\t" \ + "preceu.ph.qbr %[p6_r], %[p6] \n\t" \ + "preceu.ph.qbr %[p5_r], %[p5] \n\t" \ + "preceu.ph.qbr %[p4_r], %[p4] \n\t" \ + "preceu.ph.qbr %[q4_r], %[q4] \n\t" \ + "preceu.ph.qbr %[q5_r], %[q5] \n\t" \ + "preceu.ph.qbr %[q6_r], %[q6] \n\t" \ + "preceu.ph.qbr %[q7_r], %[q7] \n\t" \ + \ + : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \ + [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \ + [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \ + : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \ + [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \ + } + +#define COMBINE_LEFT_RIGHT_0TO2() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \ + "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \ + "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \ + "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \ + "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \ + "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \ + \ + : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \ + [q1] "=&r"(q1), [q2] "=&r"(q2) \ + : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \ + [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \ + [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \ + [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \ + } + +#define COMBINE_LEFT_RIGHT_3TO6() \ + { \ + __asm__ __volatile__( \ + "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \ + "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \ + "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \ + "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \ + "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \ + "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \ + "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \ + "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \ + \ + : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \ + [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \ + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \ + [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \ + [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \ + [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \ + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \ + [q6_r] "r"(q6_r)); \ + } + +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h new file mode 100644 index 000000000..8db3e521f --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_mem/aom_mem.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if HAVE_DSPR2 +/* processing 4 pixels at the same time + * compute hev and mask in the same function */ +static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit, + uint32_t p1, uint32_t p0, uint32_t p3, + uint32_t p2, uint32_t q0, uint32_t q1, + uint32_t q2, uint32_t q3, + uint32_t thresh, uint32_t *hev, + uint32_t *mask) { + uint32_t c, r, r3, r_k; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t hev1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; +} + +static INLINE void filter_hev_mask_flatmask4_dspr2( + uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0, + uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) { + uint32_t c, r, r3, r_k, r_flat; + uint32_t s1, s2, s3; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t hev1; + uint32_t flat1; + + __asm__ __volatile__( + /* mask |= (abs(p3 - p2) > limit) */ + "subu_s.qb %[c], %[p3], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* mask |= (abs(p2 - p1) > limit) */ + "subu_s.qb %[c], %[p2], %[p1] \n\t" + "subu_s.qb %[r_k], %[p1], %[p2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + /* mask |= (abs(p1 - p0) > limit) + * hev |= (abs(p1 - p0) > thresh) + * flat |= (abs(p1 - p0) > thresh) + */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], $0, %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* mask |= (abs(q1 - q0) > limit) + * hev |= (abs(q1 - q0) > thresh) + * flat |= (abs(q1 - q0) > thresh) + */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t" + "or %[r3], %[r3], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + /* look at stall here */ + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + + /* mask |= (abs(q2 - q1) > limit) */ + "subu_s.qb %[c], %[q2], %[q1] \n\t" + "subu_s.qb %[r_k], %[q1], %[q2] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r3], %[r3], 24 \n\t" + + /* mask |= (abs(q3 - q2) > limit) */ + "subu_s.qb %[c], %[q3], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3), + [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1) + : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), + [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3), + [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + __asm__ __volatile__( + /* abs(p0 - q0) */ + "subu_s.qb %[c], %[p0], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[p0] \n\t" + "wrdsp %[r3] \n\t" + "or %[s1], %[r_k], %[c] \n\t" + + /* abs(p1 - q1) */ + "subu_s.qb %[c], %[p1], %[q1] \n\t" + "addu_s.qb %[s3], %[s1], %[s1] \n\t" + "pick.qb %[hev1], %[ones], $0 \n\t" + "subu_s.qb %[r_k], %[q1], %[p1] \n\t" + "or %[s2], %[r_k], %[c] \n\t" + + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */ + "shrl.qb %[s2], %[s2], 1 \n\t" + "addu_s.qb %[s1], %[s2], %[s3] \n\t" + "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + + "wrdsp %[r] \n\t" + "pick.qb %[s2], $0, %[ones] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1), + [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3) + : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1), + [ones] "r"(ones), [flimit] "r"(flimit)); + + *hev = hev1; + *mask = s2; + *flat = flat1; +} + +static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, + uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2, + uint32_t q3, uint32_t q4, uint32_t *flat2) { + uint32_t c, r, r_k, r_flat; + uint32_t ones = 0xFFFFFFFF; + uint32_t flat_thresh = 0x01010101; + uint32_t flat1, flat3; + + __asm__ __volatile__( + /* flat |= (abs(p4 - p0) > thresh) */ + "subu_s.qb %[c], %[p4], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], $0, %[c] \n\t" + + /* flat |= (abs(q4 - q0) > thresh) */ + "subu_s.qb %[c], %[q4], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q4] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r], %[r], %[c] \n\t" + "sll %[r], %[r], 24 \n\t" + "wrdsp %[r] \n\t" + "pick.qb %[flat3], $0, %[ones] \n\t" + + /* flat |= (abs(p1 - p0) > thresh) */ + "subu_s.qb %[c], %[p1], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], $0, %[c] \n\t" + + /* flat |= (abs(q1 - q0) > thresh) */ + "subu_s.qb %[c], %[q1], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q1] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p0 - p2) > thresh) */ + "subu_s.qb %[c], %[p0], %[p2] \n\t" + "subu_s.qb %[r_k], %[p2], %[p0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q0 - q2) > thresh) */ + "subu_s.qb %[c], %[q0], %[q2] \n\t" + "subu_s.qb %[r_k], %[q2], %[q0] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(p3 - p0) > thresh) */ + "subu_s.qb %[c], %[p3], %[p0] \n\t" + "subu_s.qb %[r_k], %[p0], %[p3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + + /* flat |= (abs(q3 - q0) > thresh) */ + "subu_s.qb %[c], %[q3], %[q0] \n\t" + "subu_s.qb %[r_k], %[q0], %[q3] \n\t" + "or %[r_k], %[r_k], %[c] \n\t" + "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t" + "or %[r_flat], %[r_flat], %[c] \n\t" + "sll %[r_flat], %[r_flat], 24 \n\t" + "wrdsp %[r_flat] \n\t" + "pick.qb %[flat1], $0, %[ones] \n\t" + /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */ + "and %[flat1], %[flat3], %[flat1] \n\t" + + : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat), + [flat1] "=&r"(flat1), [flat3] "=&r"(flat3) + : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), + [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4), + [flat_thresh] "r"(flat_thresh), [ones] "r"(ones)); + + *flat2 = flat1; +} +#endif // #if HAVE_DSPR2 +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c new file mode 100644 index 000000000..a3b5a9eb1 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c @@ -0,0 +1,589 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint32_t mask; + uint32_t hev, flat; + uint8_t i; + uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p3, p2, p1, p0, q0, q1, q2, q3; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < 2; i++) { + sp3 = s - (pitch << 2); + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + + __asm__ __volatile__( + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat != 0) && (mask != 0)) { + /* filtering */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p3, p2, p1, p0, q3, q2, q1, q0; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l; + uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + if ((flat == 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((mask & flat) == 0xFFFFFFFF) { + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat != 0) && (mask != 0)) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0), + [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0), + [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c new file mode 100644 index 000000000..8d2fd69f7 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -0,0 +1,734 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int count) { + uint32_t mask; + uint32_t hev, flat, flat2; + uint8_t i; + uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0; + uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + /* prefetch data for store */ + prefetch_store(s); + + for (i = 0; i < (2 * count); i++) { + sp7 = s - (pitch << 3); + sp6 = sp7 + pitch; + sp5 = sp6 + pitch; + sp4 = sp5 + pitch; + sp3 = sp4 + pitch; + sp2 = sp3 + pitch; + sp1 = sp2 + pitch; + sp0 = sp1 + pitch; + sq0 = s; + sq1 = s + pitch; + sq2 = sq1 + pitch; + sq3 = sq2 + pitch; + sq4 = sq3 + pitch; + sq5 = sq4 + pitch; + sq6 = sq5 + pitch; + sq7 = sq6 + pitch; + + __asm__ __volatile__( + "lw %[p7], (%[sp7]) \n\t" + "lw %[p6], (%[sp6]) \n\t" + "lw %[p5], (%[sp5]) \n\t" + "lw %[p4], (%[sp4]) \n\t" + "lw %[p3], (%[sp3]) \n\t" + "lw %[p2], (%[sp2]) \n\t" + "lw %[p1], (%[sp1]) \n\t" + "lw %[p0], (%[sp0]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7)); + + __asm__ __volatile__( + "lw %[q0], (%[sq0]) \n\t" + "lw %[q1], (%[sq1]) \n\t" + "lw %[q2], (%[sq2]) \n\t" + "lw %[q3], (%[sq3]) \n\t" + "lw %[q4], (%[sq4]) \n\t" + "lw %[q5], (%[sq5]) \n\t" + "lw %[q6], (%[sq6]) \n\t" + "lw %[q7], (%[sq7]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0), + [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7)); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + __asm__ __volatile__( + "sw %[p1_f0], (%[sp1]) \n\t" + "sw %[p0_f0], (%[sp0]) \n\t" + "sw %[q0_f0], (%[sq0]) \n\t" + "sw %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1)); + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + COMBINE_LEFT_RIGHT_0TO2() + COMBINE_LEFT_RIGHT_3TO6() + + __asm__ __volatile__( + "sw %[p6], (%[sp6]) \n\t" + "sw %[p5], (%[sp5]) \n\t" + "sw %[p4], (%[sp4]) \n\t" + "sw %[p3], (%[sp3]) \n\t" + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + + : + : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), + [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6), + [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sw %[q6], (%[sq6]) \n\t" + "sw %[q5], (%[sq5]) \n\t" + "sw %[q4], (%[sq4]) \n\t" + "sw %[q3], (%[sq3]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + + : + : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3), + [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6), + [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2), + [sq1] "r"(sq1), [sq0] "r"(sq0)); + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + COMBINE_LEFT_RIGHT_0TO2() + + __asm__ __volatile__( + "sw %[p2], (%[sp2]) \n\t" + "sw %[p1], (%[sp1]) \n\t" + "sw %[p0], (%[sp0]) \n\t" + "sw %[q0], (%[sq0]) \n\t" + "sw %[q1], (%[sq1]) \n\t" + "sw %[q2], (%[sq2]) \n\t" + + : + : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), + [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1), + [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0+f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), + [sq1] "r"(sq1), [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 + f2 */ + /* f0 function */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* f1 function */ + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + /* f2 function */ + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], (%[sp6]) \n\t" + "sb %[p5_r], (%[sp5]) \n\t" + "sb %[p4_r], (%[sp4]) \n\t" + "sb %[p3_r], (%[sp3]) \n\t" + "sb %[p2_r], (%[sp2]) \n\t" + "sb %[p1_r], (%[sp1]) \n\t" + "sb %[p0_r], (%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), + [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], (%[sq0]) \n\t" + "sb %[q1_r], (%[sq1]) \n\t" + "sb %[q2_r], (%[sq2]) \n\t" + "sb %[q3_r], (%[sq3]) \n\t" + "sb %[q4_r], (%[sq4]) \n\t" + "sb %[q5_r], (%[sq5]) \n\t" + "sb %[q6_r], (%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], (%[sp2]) \n\t" + "sb %[p1_r_f1], (%[sp1]) \n\t" + "sb %[p0_r_f1], (%[sp0]) \n\t" + "sb %[q0_r_f1], (%[sq0]) \n\t" + "sb %[q1_r_f1], (%[sq1]) \n\t" + "sb %[q2_r_f1], (%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], (%[sp1]) \n\t" + "sb %[p0_f0], (%[sp0]) \n\t" + "sb %[q0_f0], (%[sq0]) \n\t" + "sb %[q1_f0], (%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r), + [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), + [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], +1(%[sp6]) \n\t" + "sb %[p5_r], +1(%[sp5]) \n\t" + "sb %[p4_r], +1(%[sp4]) \n\t" + "sb %[p3_r], +1(%[sp3]) \n\t" + "sb %[p2_r], +1(%[sp2]) \n\t" + "sb %[p1_r], +1(%[sp1]) \n\t" + "sb %[p0_r], +1(%[sp0]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_r], +1(%[sq0]) \n\t" + "sb %[q1_r], +1(%[sq1]) \n\t" + "sb %[q2_r], +1(%[sq2]) \n\t" + "sb %[q3_r], +1(%[sq3]) \n\t" + "sb %[q4_r], +1(%[sq4]) \n\t" + "sb %[q5_r], +1(%[sq5]) \n\t" + "sb %[q6_r], +1(%[sq6]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], +1(%[sp2]) \n\t" + "sb %[p1_r_f1], +1(%[sp1]) \n\t" + "sb %[p0_r_f1], +1(%[sp0]) \n\t" + "sb %[q0_r_f1], +1(%[sq0]) \n\t" + "sb %[q1_r_f1], +1(%[sq1]) \n\t" + "sb %[q2_r_f1], +1(%[sq2]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], +1(%[sp1]) \n\t" + "sb %[p0_f0], +1(%[sp0]) \n\t" + "sb %[q0_f0], +1(%[sq0]) \n\t" + "sb %[q1_f0], +1(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], +2(%[sp6]) \n\t" + "sb %[p5_l], +2(%[sp5]) \n\t" + "sb %[p4_l], +2(%[sp4]) \n\t" + "sb %[p3_l], +2(%[sp3]) \n\t" + "sb %[p2_l], +2(%[sp2]) \n\t" + "sb %[p1_l], +2(%[sp1]) \n\t" + "sb %[p0_l], +2(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +2(%[sq0]) \n\t" + "sb %[q1_l], +2(%[sq1]) \n\t" + "sb %[q2_l], +2(%[sq2]) \n\t" + "sb %[q3_l], +2(%[sq3]) \n\t" + "sb %[q4_l], +2(%[sq4]) \n\t" + "sb %[q5_l], +2(%[sq5]) \n\t" + "sb %[q6_l], +2(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), + [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +2(%[sp2]) \n\t" + "sb %[p1_l_f1], +2(%[sp1]) \n\t" + "sb %[p0_l_f1], +2(%[sp0]) \n\t" + "sb %[q0_l_f1], +2(%[sq0]) \n\t" + "sb %[q1_l_f1], +2(%[sq1]) \n\t" + "sb %[q2_l_f1], +2(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], +2(%[sp1]) \n\t" + "sb %[p0_f0], +2(%[sp0]) \n\t" + "sb %[q0_f0], +2(%[sq0]) \n\t" + "sb %[q1_f0], +2(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], +3(%[sp6]) \n\t" + "sb %[p5_l], +3(%[sp5]) \n\t" + "sb %[p4_l], +3(%[sp4]) \n\t" + "sb %[p3_l], +3(%[sp3]) \n\t" + "sb %[p2_l], +3(%[sp2]) \n\t" + "sb %[p1_l], +3(%[sp1]) \n\t" + "sb %[p0_l], +3(%[sp0]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), + [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0)); + + __asm__ __volatile__( + "sb %[q0_l], +3(%[sq0]) \n\t" + "sb %[q1_l], +3(%[sq1]) \n\t" + "sb %[q2_l], +3(%[sq2]) \n\t" + "sb %[q3_l], +3(%[sq3]) \n\t" + "sb %[q4_l], +3(%[sq4]) \n\t" + "sb %[q5_l], +3(%[sq5]) \n\t" + "sb %[q6_l], +3(%[sq6]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3), + [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], +3(%[sp2]) \n\t" + "sb %[p1_l_f1], +3(%[sp1]) \n\t" + "sb %[p0_l_f1], +3(%[sp0]) \n\t" + "sb %[q0_l_f1], +3(%[sq0]) \n\t" + "sb %[q1_l_f1], +3(%[sq1]) \n\t" + "sb %[q2_l_f1], +3(%[sq2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2), + [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), + [sq2] "r"(sq2)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], +3(%[sp1]) \n\t" + "sb %[p0_f0], +3(%[sp0]) \n\t" + "sb %[q0_f0], +3(%[sq0]) \n\t" + "sb %[q1_f0], +3(%[sq1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), + [sq0] "r"(sq0), [sq1] "r"(sq1)); + } + } + + s = s + 4; + } +} + +void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); +} + +void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2); +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c new file mode 100644 index 000000000..28528869b --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -0,0 +1,757 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/mips/common_dspr2.h" +#include "aom_dsp/mips/loopfilter_filters_dspr2.h" +#include "aom_dsp/mips/loopfilter_macros_dspr2.h" +#include "aom_dsp/mips/loopfilter_masks_dspr2.h" +#include "aom_mem/aom_mem.h" + +#if HAVE_DSPR2 +void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8_t i; + uint32_t mask, hev, flat, flat2; + uint8_t *s1, *s2, *s3, *s4; + uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + uint32_t thresh_vec, flimit_vec, limit_vec; + uint32_t uflimit, ulimit, uthresh; + uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; + uint32_t p1_f0, p0_f0, q0_f0, q1_f0; + uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l; + uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l; + uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r; + uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r; + uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1; + uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1; + + uflimit = *blimit; + ulimit = *limit; + uthresh = *thresh; + + /* create quad-byte */ + __asm__ __volatile__( + "replv.qb %[thresh_vec], %[uthresh] \n\t" + "replv.qb %[flimit_vec], %[uflimit] \n\t" + "replv.qb %[limit_vec], %[ulimit] \n\t" + + : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec), + [limit_vec] "=r"(limit_vec) + : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit)); + + prefetch_store(s + pitch); + + for (i = 0; i < 2; i++) { + s1 = s; + s2 = s + pitch; + s3 = s2 + pitch; + s4 = s3 + pitch; + s = s4 + pitch; + + __asm__ __volatile__( + "lw %[p0], -4(%[s1]) \n\t" + "lw %[p1], -4(%[s2]) \n\t" + "lw %[p2], -4(%[s3]) \n\t" + "lw %[p3], -4(%[s4]) \n\t" + "lw %[p4], -8(%[s1]) \n\t" + "lw %[p5], -8(%[s2]) \n\t" + "lw %[p6], -8(%[s3]) \n\t" + "lw %[p7], -8(%[s4]) \n\t" + + : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), + [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + __asm__ __volatile__( + "lw %[q3], (%[s1]) \n\t" + "lw %[q2], (%[s2]) \n\t" + "lw %[q1], (%[s3]) \n\t" + "lw %[q0], (%[s4]) \n\t" + "lw %[q7], +4(%[s1]) \n\t" + "lw %[q6], +4(%[s2]) \n\t" + "lw %[q5], +4(%[s3]) \n\t" + "lw %[q4], +4(%[s4]) \n\t" + + : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0), + [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4) + : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)); + + /* transpose p3, p2, p1, p0 + original (when loaded from memory) + register -4 -3 -2 -1 + p0 p0_0 p0_1 p0_2 p0_3 + p1 p1_0 p1_1 p1_2 p1_3 + p2 p2_0 p2_1 p2_2 p2_3 + p3 p3_0 p3_1 p3_2 p3_3 + + after transpose + register + p0 p3_3 p2_3 p1_3 p0_3 + p1 p3_2 p2_2 p1_2 p0_2 + p2 p3_1 p2_1 p1_1 p0_1 + p3 p3_0 p2_0 p1_0 p0_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t" + "precr.qb.ph %[prim2], %[p0], %[p1] \n\t" + "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t" + "precr.qb.ph %[prim4], %[p2], %[p3] \n\t" + + "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p0], %[p1], %[sec3] \n\t" + "precrq.ph.w %[p2], %[p3], %[sec4] \n\t" + "append %[p1], %[sec3], 16 \n\t" + "append %[p3], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2), + [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q0, q1, q2, q3 + original (when loaded from memory) + register +1 +2 +3 +4 + q3 q3_0 q3_1 q3_2 q3_3 + q2 q2_0 q2_1 q2_2 q2_3 + q1 q1_0 q1_1 q1_2 q1_3 + q0 q0_0 q0_1 q0_2 q0_3 + + after transpose + register + q3 q0_3 q1_3 q2_3 q3_3 + q2 q0_2 q1_2 q2_2 q3_2 + q1 q0_1 q1_1 q2_1 q3_1 + q0 q0_0 q1_0 q2_0 q3_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t" + "precr.qb.ph %[prim2], %[q3], %[q2] \n\t" + "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t" + "precr.qb.ph %[prim4], %[q1], %[q0] \n\t" + + "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q3], %[q2], %[sec3] \n\t" + "precrq.ph.w %[q1], %[q0], %[sec4] \n\t" + "append %[q2], %[sec3], 16 \n\t" + "append %[q0], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1), + [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose p7, p6, p5, p4 + original (when loaded from memory) + register -8 -7 -6 -5 + p4 p4_0 p4_1 p4_2 p4_3 + p5 p5_0 p5_1 p5_2 p5_3 + p6 p6_0 p6_1 p6_2 p6_3 + p7 p7_0 p7_1 p7_2 p7_3 + + after transpose + register + p4 p7_3 p6_3 p5_3 p4_3 + p5 p7_2 p6_2 p5_2 p4_2 + p6 p7_1 p6_1 p5_1 p4_1 + p7 p7_0 p6_0 p5_0 p4_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t" + "precr.qb.ph %[prim2], %[p4], %[p5] \n\t" + "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t" + "precr.qb.ph %[prim4], %[p6], %[p7] \n\t" + + "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[p4], %[p5], %[sec3] \n\t" + "precrq.ph.w %[p6], %[p7], %[sec4] \n\t" + "append %[p5], %[sec3], 16 \n\t" + "append %[p7], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6), + [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + /* transpose q4, q5, q6, q7 + original (when loaded from memory) + register +5 +6 +7 +8 + q7 q7_0 q7_1 q7_2 q7_3 + q6 q6_0 q6_1 q6_2 q6_3 + q5 q5_0 q5_1 q5_2 q5_3 + q4 q4_0 q4_1 q4_2 q4_3 + + after transpose + register + q7 q4_3 q5_3 q26_3 q7_3 + q6 q4_2 q5_2 q26_2 q7_2 + q5 q4_1 q5_1 q26_1 q7_1 + q4 q4_0 q5_0 q26_0 q7_0 + */ + __asm__ __volatile__( + "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t" + "precr.qb.ph %[prim2], %[q7], %[q6] \n\t" + "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t" + "precr.qb.ph %[prim4], %[q5], %[q4] \n\t" + + "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t" + "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t" + "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" + "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" + + "precrq.ph.w %[q7], %[q6], %[sec3] \n\t" + "precrq.ph.w %[q5], %[q4], %[sec4] \n\t" + "append %[q6], %[sec3], 16 \n\t" + "append %[q4], %[sec4], 16 \n\t" + + : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3), + [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5), + [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4) + :); + + filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0, + p3, p2, q0, q1, q2, q3, &hev, &mask, &flat); + + flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2); + + /* f0 */ + if (((flat2 == 0) && (flat == 0) && (mask != 0)) || + ((flat2 != 0) && (flat == 0) && (mask != 0))) { + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + STORE_F0() + } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) && + (mask == 0xFFFFFFFF)) { + /* f2 */ + PACK_LEFT_0TO3() + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_0TO3() + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + STORE_F2() + } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) { + /* f1 */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + STORE_F1() + } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) { + /* f0 + f1 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + /* left 2 element operation */ + PACK_LEFT_0TO3() + mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l); + + /* right 2 element operation */ + PACK_RIGHT_0TO3() + mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r); + + if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r), + [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + + : + : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r), + [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l), + [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], +1(%[s1]) \n\t" + "sb %[q2_l], +2(%[s1]) \n\t" + + : + : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l), + [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) { + /* f0+f1+f2 */ + filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0); + + PACK_LEFT_0TO3() + mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1, + &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1); + + PACK_RIGHT_0TO3() + mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1, + &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1); + + PACK_LEFT_4TO7() + wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l, + &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l, + &q6_l, &q7_l); + + PACK_RIGHT_4TO7() + wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r, + &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r, + &q6_r, &q7_r); + + if (mask & flat & flat2 & 0x000000FF) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s4]) \n\t" + "sb %[p5_r], -6(%[s4]) \n\t" + "sb %[p4_r], -5(%[s4]) \n\t" + "sb %[p3_r], -4(%[s4]) \n\t" + "sb %[p2_r], -3(%[s4]) \n\t" + "sb %[p1_r], -2(%[s4]) \n\t" + "sb %[p0_r], -1(%[s4]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s4] "r"(s4)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s4]) \n\t" + "sb %[q1_r], +1(%[s4]) \n\t" + "sb %[q2_r], +2(%[s4]) \n\t" + "sb %[q3_r], +3(%[s4]) \n\t" + "sb %[q4_r], +4(%[s4]) \n\t" + "sb %[q5_r], +5(%[s4]) \n\t" + "sb %[q6_r], +6(%[s4]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s4] "r"(s4)); + } else if (mask & flat & 0x000000FF) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s4]) \n\t" + "sb %[p1_r_f1], -2(%[s4]) \n\t" + "sb %[p0_r_f1], -1(%[s4]) \n\t" + "sb %[q0_r_f1], (%[s4]) \n\t" + "sb %[q1_r_f1], +1(%[s4]) \n\t" + "sb %[q2_r_f1], +2(%[s4]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4)); + } else if (mask & 0x000000FF) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s4]) \n\t" + "sb %[p0_f0], -1(%[s4]) \n\t" + "sb %[q0_f0], (%[s4]) \n\t" + "sb %[q1_f0], +1(%[s4]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s4] "r"(s4)); + } + + __asm__ __volatile__( + "srl %[p6_r], %[p6_r], 16 \n\t" + "srl %[p5_r], %[p5_r], 16 \n\t" + "srl %[p4_r], %[p4_r], 16 \n\t" + "srl %[p3_r], %[p3_r], 16 \n\t" + "srl %[p2_r], %[p2_r], 16 \n\t" + "srl %[p1_r], %[p1_r], 16 \n\t" + "srl %[p0_r], %[p0_r], 16 \n\t" + "srl %[q0_r], %[q0_r], 16 \n\t" + "srl %[q1_r], %[q1_r], 16 \n\t" + "srl %[q2_r], %[q2_r], 16 \n\t" + "srl %[q3_r], %[q3_r], 16 \n\t" + "srl %[q4_r], %[q4_r], 16 \n\t" + "srl %[q5_r], %[q5_r], 16 \n\t" + "srl %[q6_r], %[q6_r], 16 \n\t" + + : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r), + [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r), + [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), + [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), + [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r) + :); + + __asm__ __volatile__( + "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t" + "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t" + "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t" + "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t" + "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t" + "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1), + [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1), + [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p6_r], -7(%[s3]) \n\t" + "sb %[p5_r], -6(%[s3]) \n\t" + "sb %[p4_r], -5(%[s3]) \n\t" + "sb %[p3_r], -4(%[s3]) \n\t" + "sb %[p2_r], -3(%[s3]) \n\t" + "sb %[p1_r], -2(%[s3]) \n\t" + "sb %[p0_r], -1(%[s3]) \n\t" + + : + : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r), + [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), + [p0_r] "r"(p0_r), [s3] "r"(s3)); + + __asm__ __volatile__( + "sb %[q0_r], (%[s3]) \n\t" + "sb %[q1_r], +1(%[s3]) \n\t" + "sb %[q2_r], +2(%[s3]) \n\t" + "sb %[q3_r], +3(%[s3]) \n\t" + "sb %[q4_r], +4(%[s3]) \n\t" + "sb %[q5_r], +5(%[s3]) \n\t" + "sb %[q6_r], +6(%[s3]) \n\t" + + : + : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r), + [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), + [q6_r] "r"(q6_r), [s3] "r"(s3)); + } else if (mask & flat & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p2_r_f1], -3(%[s3]) \n\t" + "sb %[p1_r_f1], -2(%[s3]) \n\t" + "sb %[p0_r_f1], -1(%[s3]) \n\t" + "sb %[q0_r_f1], (%[s3]) \n\t" + "sb %[q1_r_f1], +1(%[s3]) \n\t" + "sb %[q2_r_f1], +2(%[s3]) \n\t" + + : + : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1), + [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1), + [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3)); + } else if (mask & 0x0000FF00) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s3]) \n\t" + "sb %[p0_f0], -1(%[s3]) \n\t" + "sb %[q0_f0], (%[s3]) \n\t" + "sb %[q1_f0], +1(%[s3]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s3] "r"(s3)); + } + + __asm__ __volatile__( + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s2]) \n\t" + "sb %[p5_l], -6(%[s2]) \n\t" + "sb %[p4_l], -5(%[s2]) \n\t" + "sb %[p3_l], -4(%[s2]) \n\t" + "sb %[p2_l], -3(%[s2]) \n\t" + "sb %[p1_l], -2(%[s2]) \n\t" + "sb %[p0_l], -1(%[s2]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s2] "r"(s2)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s2]) \n\t" + "sb %[q1_l], +1(%[s2]) \n\t" + "sb %[q2_l], +2(%[s2]) \n\t" + "sb %[q3_l], +3(%[s2]) \n\t" + "sb %[q4_l], +4(%[s2]) \n\t" + "sb %[q5_l], +5(%[s2]) \n\t" + "sb %[q6_l], +6(%[s2]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s2] "r"(s2)); + } else if (mask & flat & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s2]) \n\t" + "sb %[p1_l_f1], -2(%[s2]) \n\t" + "sb %[p0_l_f1], -1(%[s2]) \n\t" + "sb %[q0_l_f1], (%[s2]) \n\t" + "sb %[q1_l_f1], +1(%[s2]) \n\t" + "sb %[q2_l_f1], +2(%[s2]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2)); + } else if (mask & 0x00FF0000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s2]) \n\t" + "sb %[p0_f0], -1(%[s2]) \n\t" + "sb %[q0_f0], (%[s2]) \n\t" + "sb %[q1_f0], +1(%[s2]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s2] "r"(s2)); + } + + __asm__ __volatile__( + "srl %[p6_l], %[p6_l], 16 \n\t" + "srl %[p5_l], %[p5_l], 16 \n\t" + "srl %[p4_l], %[p4_l], 16 \n\t" + "srl %[p3_l], %[p3_l], 16 \n\t" + "srl %[p2_l], %[p2_l], 16 \n\t" + "srl %[p1_l], %[p1_l], 16 \n\t" + "srl %[p0_l], %[p0_l], 16 \n\t" + "srl %[q0_l], %[q0_l], 16 \n\t" + "srl %[q1_l], %[q1_l], 16 \n\t" + "srl %[q2_l], %[q2_l], 16 \n\t" + "srl %[q3_l], %[q3_l], 16 \n\t" + "srl %[q4_l], %[q4_l], 16 \n\t" + "srl %[q5_l], %[q5_l], 16 \n\t" + "srl %[q6_l], %[q6_l], 16 \n\t" + + : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l), + [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l), + [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l), + [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l), + [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l) + :); + + __asm__ __volatile__( + "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t" + "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t" + "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t" + "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t" + "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t" + "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t" + "srl %[p1_f0], %[p1_f0], 8 \n\t" + "srl %[p0_f0], %[p0_f0], 8 \n\t" + "srl %[q0_f0], %[q0_f0], 8 \n\t" + "srl %[q1_f0], %[q1_f0], 8 \n\t" + + : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1), + [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1), + [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1), + [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), + [q1_f0] "+r"(q1_f0) + :); + + if (mask & flat & flat2 & 0xFF000000) { + __asm__ __volatile__( + "sb %[p6_l], -7(%[s1]) \n\t" + "sb %[p5_l], -6(%[s1]) \n\t" + "sb %[p4_l], -5(%[s1]) \n\t" + "sb %[p3_l], -4(%[s1]) \n\t" + "sb %[p2_l], -3(%[s1]) \n\t" + "sb %[p1_l], -2(%[s1]) \n\t" + "sb %[p0_l], -1(%[s1]) \n\t" + + : + : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), + [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), + [p0_l] "r"(p0_l), [s1] "r"(s1)); + + __asm__ __volatile__( + "sb %[q0_l], (%[s1]) \n\t" + "sb %[q1_l], 1(%[s1]) \n\t" + "sb %[q2_l], 2(%[s1]) \n\t" + "sb %[q3_l], 3(%[s1]) \n\t" + "sb %[q4_l], 4(%[s1]) \n\t" + "sb %[q5_l], 5(%[s1]) \n\t" + "sb %[q6_l], 6(%[s1]) \n\t" + + : + : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l), + [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), + [q6_l] "r"(q6_l), [s1] "r"(s1)); + } else if (mask & flat & 0xFF000000) { + __asm__ __volatile__( + "sb %[p2_l_f1], -3(%[s1]) \n\t" + "sb %[p1_l_f1], -2(%[s1]) \n\t" + "sb %[p0_l_f1], -1(%[s1]) \n\t" + "sb %[q0_l_f1], (%[s1]) \n\t" + "sb %[q1_l_f1], +1(%[s1]) \n\t" + "sb %[q2_l_f1], +2(%[s1]) \n\t" + + : + : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1), + [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1), + [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1)); + } else if (mask & 0xFF000000) { + __asm__ __volatile__( + "sb %[p1_f0], -2(%[s1]) \n\t" + "sb %[p0_f0], -1(%[s1]) \n\t" + "sb %[q0_f0], (%[s1]) \n\t" + "sb %[q1_f0], +1(%[s1]) \n\t" + + : + : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0), + [q1_f0] "r"(q1_f0), [s1] "r"(s1)); + } + } + } +} +#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h new file mode 100644 index 000000000..450594262 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_LOOPFILTER_MSA_H_ +#define AOM_DSP_LOOPFILTER_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt = filt & (v16i8)hev_in; \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + /* combine left and right part */ \ + filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ + \ + filt = filt & (v16i8)mask_in; \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ + p1_out, p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ + v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ + v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + \ + filt = filt & (v16i8)hev_in; \ + \ + q0_sub_p0 = q0_m - p0_m; \ + filt_sign = __msa_clti_s_b(filt, 0); \ + \ + cnst3h = __msa_ldi_h(3); \ + q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ + filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ + filt_r += q0_sub_p0_r; \ + filt_r = __msa_sat_s_h(filt_r, 7); \ + \ + q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ + q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ + filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ + filt_l += q0_sub_p0_l; \ + filt_l = __msa_sat_s_h(filt_l, 7); \ + \ + filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ + filt = filt & (v16i8)mask_in; \ + \ + cnst4b = __msa_ldi_b(4); \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= 3; \ + \ + cnst3b = __msa_ldi_b(3); \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= 3; \ + \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + \ + filt = __msa_srari_b(filt1, 1); \ + hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ + filt = filt & (v16i8)hev_in; \ + \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ + } + +#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ + } + +#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ + q6_in, q7_in, flat_in, flat2_out) \ + { \ + v16u8 tmp_flat5, zero_in = { 0 }; \ + v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ + v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ + \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ + p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ + q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ + p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ + q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \ + p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \ + q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \ + p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \ + q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \ + \ + p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \ + flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \ + flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \ + p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \ + flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \ + p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ + flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ + \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ + flat2_out = __msa_xori_b(flat2_out, 0xff); \ + flat2_out = flat2_out & flat_in; \ + } + +#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ + q1_filt8_out, q2_filt8_out) \ + { \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ + \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ + \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ + \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ + } + +#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ + limit_in, b_limit_in, thresh_in, hev_out, mask_out, \ + flat_out) \ + { \ + v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \ + v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \ + \ + /* absolute subtraction of pixel values */ \ + p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \ + p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \ + p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \ + q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \ + q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \ + q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \ + p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \ + p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \ + \ + /* calculation of hev */ \ + flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \ + hev_out = thresh_in < (v16u8)flat_out; \ + \ + /* calculation of mask */ \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \ + p1_asub_q1_m >>= 1; \ + p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \ + \ + mask_out = b_limit_in < p0_asub_q0_m; \ + mask_out = __msa_max_u_b(flat_out, mask_out); \ + p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \ + mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \ + q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \ + mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \ + \ + mask_out = limit_in < (v16u8)mask_out; \ + mask_out = __msa_xori_b(mask_out, 0xff); \ + } +#endif /* AOM_DSP_LOOPFILTER_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h new file mode 100644 index 000000000..48fbcfd47 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/macros_msa.h @@ -0,0 +1,2057 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_MACROS_MSA_H_ +#define AOM_DSP_MIPS_MACROS_MSA_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + +#if (__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m = 0; \ + \ + val0_m = LW(psrc_m); \ + val1_m = LW(psrc_m + 4); \ + \ + val_m = (uint64_t)(val1_m); \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ + \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint64_t val_m = (val); \ + \ + __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } +#else // !(__mips_isa_rev >= 6) +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint16_t val_m; \ + \ + __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint32_t val_m; \ + \ + __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) + +#if (__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m = (const uint8_t *)(psrc); \ + uint64_t val_m = 0; \ + \ + __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ + \ + : [val_m] "=r"(val_m) \ + : [psrc_m] "m"(*psrc_m)); \ + \ + val_m; \ + }) +#else // !(__mips == 64) +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ + }) +#endif // (__mips == 64) + +#define SH(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint16_t val_m = (val); \ + \ + __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SW(val, pdst) \ + { \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + const uint32_t val_m = (val); \ + \ + __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ + \ + : [pdst_m] "=m"(*pdst_m) \ + : [val_m] "r"(val_m)); \ + } + +#define SD(val, pdst) \ + { \ + uint8_t *pdst_m1 = (uint8_t *)(pdst); \ + uint32_t val0_m, val1_m; \ + \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_m, pdst_m1); \ + SW(val1_m, pdst_m1 + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4(psrc, stride, out0, out1, out2, out3) \ + { \ + out0 = LW((psrc)); \ + out1 = LW((psrc) + stride); \ + out2 = LW((psrc) + 2 * stride); \ + out3 = LW((psrc) + 3 * stride); \ + } + +/* Description : Load double words with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) +*/ +#define LD2(psrc, stride, out0, out1) \ + { \ + out0 = LD((psrc)); \ + out1 = LD((psrc) + stride); \ + } +#define LD4(psrc, stride, out0, out1, out2, out3) \ + { \ + LD2((psrc), stride, out0, out1); \ + LD2((psrc) + 2 * stride, stride, out2, out3); \ + } + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4(in0, in1, in2, in3, pdst, stride) \ + { \ + SW(in0, (pdst)) \ + SW(in1, (pdst) + stride); \ + SW(in2, (pdst) + 2 * stride); \ + SW(in3, (pdst) + 3 * stride); \ + } + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4(in0, in1, in2, in3, pdst, stride) \ + { \ + SD(in0, (pdst)) \ + SD(in1, (pdst) + stride); \ + SD(in2, (pdst) + 2 * stride); \ + SD(in3, (pdst) + 3 * stride); \ + } + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + } +#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + } +#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) + +#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ + { \ + LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + } +#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) + +#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_H(RTYPE, (psrc)); \ + out1 = LD_H(RTYPE, (psrc) + (stride)); \ + } +#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) + +#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_H2(RTYPE, (psrc), stride, out0, out1); \ + LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) + +#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7) \ + { \ + LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ + } +#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) + +#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + out7); \ + LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + out13, out14, out15); \ + } +#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Input - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH(psrc, out0, out1, out2, out3) \ + { \ + out0 = LD_SH(psrc); \ + out2 = LD_SH(psrc + 8); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + } + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) \ + { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ + } + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) + +#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_H2(RTYPE, in0, in1, (pdst), stride); \ + ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) + +#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ + { \ + ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ + ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ + } +#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) + +/* Description : Store vectors of word elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 4 word elements from 'in0' to (pdst) + Store 4 word elements from 'in1' to (pdst + stride) +*/ +#define ST_SW2(in0, in1, pdst, stride) \ + { \ + ST_SW(in0, (pdst)); \ + ST_SW(in1, (pdst) + stride); \ + } + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB(in, stidx, pdst, stride) \ + { \ + uint16_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \ + out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \ + out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \ + out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \ + \ + SH(out0_m, pblk_2x4_m); \ + SH(out1_m, pblk_2x4_m + stride); \ + SH(out2_m, pblk_2x4_m + 2 * stride); \ + SH(out3_m, pblk_2x4_m + 3 * stride); \ + } + +/* Description : Store 4x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP + register and stored to (pdst) + Index 1 word element from 'in' vector is copied to the GP + register and stored to (pdst + stride) +*/ +#define ST4x2_UB(in, pdst, stride) \ + { \ + uint32_t out0_m, out1_m; \ + uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in, 0); \ + out1_m = __msa_copy_u_w((v4i32)in, 1); \ + \ + SW(out0_m, pblk_4x2_m); \ + SW(out1_m, pblk_4x2_m + stride); \ + } + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \ + { \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_w((v4i32)in0, idx0); \ + out1_m = __msa_copy_u_w((v4i32)in0, idx1); \ + out2_m = __msa_copy_u_w((v4i32)in1, idx2); \ + out3_m = __msa_copy_u_w((v4i32)in1, idx3); \ + \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ + } +#define ST4x8_UB(in0, in1, pdst, stride) \ + { \ + uint8_t *pblk_4x8 = (uint8_t *)(pdst); \ + \ + ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \ + ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ + } + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) +*/ +#define ST8x1_UB(in, pdst) \ + { \ + uint64_t out0_m; \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + SD(out0_m, pdst); \ + } + +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) +*/ +#define ST8x2_UB(in, pdst, stride) \ + { \ + uint64_t out0_m, out1_m; \ + uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in, 0); \ + out1_m = __msa_copy_u_d((v2i64)in, 1); \ + \ + SD(out0_m, pblk_8x2_m); \ + SD(out1_m, pblk_8x2_m + stride); \ + } + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB(in0, in1, pdst, stride) \ + { \ + uint64_t out0_m, out1_m, out2_m, out3_m; \ + uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \ + \ + out0_m = __msa_copy_u_d((v2i64)in0, 0); \ + out1_m = __msa_copy_u_d((v2i64)in0, 1); \ + out2_m = __msa_copy_u_d((v2i64)in1, 0); \ + out3_m = __msa_copy_u_d((v2i64)in1, 1); \ + \ + SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \ + } + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. Then the average + with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ + out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \ + } +#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__) + +#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \ + AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \ + } +#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slid into 'in0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \ + { \ + v16i8 zero_m = { 0 }; \ + out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \ + } +#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__) + +#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \ + slide_val) \ + { \ + SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \ + SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \ + } +#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by + value specified in the 'slide_val' +*/ +#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + { \ + out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ + out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \ + } +#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__) +#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__) + +#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \ + out2, slide_val) \ + { \ + SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \ + out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \ + } +#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__) +#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) +#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) + +#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ + out3) \ + { \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \ + VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \ + } +#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__) +#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ + out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \ + } +#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__) + +#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \ + } +#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__) + +#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \ + } +#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__) + +#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) + +/* Description : Dot product of word vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result + twice the size of input i.e. signed double word. + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ + out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \ + } +#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) + +/* Description : Dot product & addition of byte vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \ + } +#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__) + +#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \ + cnst3, out0, out1, out2, out3) \ + { \ + DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \ + DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \ + } +#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) + +/* Description : Dot product & addition of halfword vector elements + Arguments : Inputs - mult0, mult1, cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \ + out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \ + } +#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__) + +/* Description : Dot product & addition of double word vector elements + Arguments : Inputs - mult0, mult1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed word element from 'mult0' is multiplied with itself + producing an intermediate result twice the size of input + i.e. signed double word + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector +*/ +#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \ + out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \ + } +#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__) + +/* Description : Minimum values between unsigned elements of + either vector are copied to the output vector + Arguments : Inputs - in0, in1, min_vec + Outputs - in place operation + Return Type - as per RTYPE + Details : Minimum of unsigned halfword element values from 'in0' and + 'min_vec' are written to output vector 'in0' +*/ +#define MIN_UH2(RTYPE, in0, in1, min_vec) \ + { \ + in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ + in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \ + } +#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__) + +#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \ + { \ + MIN_UH2(RTYPE, in0, in1, min_vec); \ + MIN_UH2(RTYPE, in2, in3, min_vec); \ + } +#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255(in) \ + ({ \ + v8i16 max_m = __msa_ldi_h(255); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h((v8i16)in, 0); \ + out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \ + out_m; \ + }) +#define CLIP_SH2_0_255(in0, in1) \ + { \ + in0 = CLIP_SH_0_255(in0); \ + in1 = CLIP_SH_0_255(in1); \ + } +#define CLIP_SH4_0_255(in0, in1, in2, in3) \ + { \ + CLIP_SH2_0_255(in0, in1); \ + CLIP_SH2_0_255(in2, in3); \ + } + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Inputs - in (unsigned halfword vector) + Outputs - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of input vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + res0_m = __msa_hadd_u_d(res_m, res_m); \ + res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ + sum_m; \ + }) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__) + +#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + HADD_UB2(RTYPE, in0, in1, out0, out1); \ + HADD_UB2(RTYPE, in2, in3, out2, out3); \ + } +#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \ + out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \ + } +#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. +*/ +#define SAD_UB2_UH(in0, in1, ref0, ref1) \ + ({ \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \ + diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \ + \ + sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \ + sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \ + \ + sad_m; \ + }) + +/* Description : Horizontal subtraction of signed halfword vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd halfword element from 'in0' is subtracted from + even signed halfword element from 'in0' (pairwise) and the + word result is written to 'out0' +*/ +#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \ + out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \ + } +#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + } +#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ + } +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) + +#define INSERT_D2(RTYPE, in0, in1, out) \ + { \ + out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ + out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ + } +#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) +#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) + +/* Description : Interleave even byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + } +#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__) +#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \ + } +#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__) +#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__) +#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave even word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \ + out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \ + } +#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ + out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \ + } +#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__) +#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__) +#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__) +#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__) + +#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__) +#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__) +#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__) +#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__) +#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. +*/ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ + } +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) + +#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \ + out5, out6, out7) \ + { \ + ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3); \ + ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \ + out6, out7); \ + } +#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ + } +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) + +#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \ + } +#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__) +#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__) + +#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \ + } +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \ + } +#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) + +#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__) +#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ + } +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ + } +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range. + The results are written in place +*/ +#define SAT_UH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \ + } +#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__) + +#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_UH2(RTYPE, in0, in1, sat_val); \ + SAT_UH2(RTYPE, in2, in3, sat_val) \ + } +#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1) bits + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val + 1) bit range + The results are written in place +*/ +#define SAT_SH2(RTYPE, in0, in1, sat_val) \ + { \ + in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ + in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \ + } +#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__) + +#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \ + { \ + SAT_SH2(RTYPE, in0, in1, sat_val); \ + SAT_SH2(RTYPE, in2, in3, sat_val); \ + } +#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__) + +/* Description : Indexed halfword element values are replicated to all + elements in output vector + Arguments : Inputs - in, idx0, idx1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : 'idx0' element value from 'in' vector is replicated to all + elements in 'out0' vector + Valid index range for halfword operation is 0-7 +*/ +#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \ + out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \ + } +#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__) + +#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \ + { \ + SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \ + SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \ + } +#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__) +#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ + } +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) + +#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__) +#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__) +#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \ + } +#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__) +#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__) + +#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \ + } +#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__) +#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__) + +#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ + PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128(RTYPE, in0, in1) \ + { \ + in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ + in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \ + } +#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__) +#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__) + +#define XORI_B3_128(RTYPE, in0, in1, in2) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \ + } +#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__) + +#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \ + { \ + XORI_B2_128(RTYPE, in0, in1); \ + XORI_B2_128(RTYPE, in2, in3); \ + } +#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__) +#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__) + +#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \ + { \ + XORI_B4_128(RTYPE, in0, in1, in2, in3); \ + XORI_B3_128(RTYPE, in4, in5, in6); \ + } +#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__) + +/* Description : Average of signed halfword elements -> (a + b) / 2 + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each signed halfword element from 'in0' is added to each + signed halfword element of 'in1' with full precision resulting + in one extra bit in the result. The result is then divided by + 2 and written to 'out0' +*/ +#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \ + out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \ + out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \ + } +#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \ + { \ + out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \ + } +#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__) + +#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3) \ + { \ + ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \ + } +#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ + } + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V(in0, in1, in2, in3, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ + } + +/* Description : Shift right arithmetic rounded words + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the number of bits in the corresponding element in the vector + 'shift'. The last discarded bit is added to shifted value for + rounding and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \ + in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \ + } + +#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRAR_W2(RTYPE, in0, in1, shift) \ + SRAR_W2(RTYPE, in2, in3, shift) \ + } +#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ + in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \ + } +#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__) +#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__) + +#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_H2(RTYPE, in0, in1, shift); \ + SRARI_H2(RTYPE, in2, in3, shift); \ + } +#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) +#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) + +#define SRARI_W2(RTYPE, in0, in1, shift) \ + { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ + } +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \ + { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ + } +#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Logical shift right all elements of vector (immediate) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - out0, out1, out2, out3 + Return Type - as per RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is an immediate value. +*/ +#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \ + { \ + out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \ + out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \ + out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \ + out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \ + } +#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ + } +#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + MUL2(in0, in1, in2, in3, out0, out1); \ + MUL2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. +*/ +#define ADD2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ + } +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ + } + +/* Description : Subtraction of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. +*/ +#define SUB2(in0, in1, in2, in3, out0, out1) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + } +#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \ + { \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ + } + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW(in, out) \ + { \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h((v8i16)in, 0); \ + out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ + } + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH(in, out0, out1) \ + { \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH(zero_m, in, out0, out1); \ + } + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW(in, out0, out1) \ + { \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ + } + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ + } + +/* Description : Butterfly of 8 input vectors + Arguments : Inputs - in0 ... in7 + Outputs - out0 .. out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + { \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ + } + +/* Description : Butterfly of 16 input vectors + Arguments : Inputs - in0 ... in15 + Outputs - out0 .. out15 + Details : Butterfly operation +*/ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \ + in11, in12, in13, in14, in15, out0, out1, out2, out3, \ + out4, out5, out6, out7, out8, out9, out10, out11, out12, \ + out13, out14, out15) \ + { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ + \ + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ + } + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \ + tmp3_m); \ + ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \ + ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \ + ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \ + ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \ + SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \ + SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \ + } +#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \ + in10, in11, in12, in13, in14, in15, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \ + ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \ + ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \ + ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \ + \ + tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \ + tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \ + tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \ + tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \ + out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \ + tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \ + out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \ + tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \ + \ + ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \ + out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \ + out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \ + out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ + out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ + } + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \ + ILVRL_W2_SH(s1_m, s0_m, out0, out2); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ + } + +/* Description : Transpose 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \ + tmp3_n); \ + ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \ + ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \ + \ + out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \ + out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ + } + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \ + ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \ + ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ + } + +/* Description : Transpose 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \ + out1, out2, out3, out4, out5, out6, out7) \ + { \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \ + ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \ + ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \ + ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \ + ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \ + PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \ + tmp7_m, out0, out2, out4, out6); \ + out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \ + out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \ + out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \ + out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \ + } +#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + \ + out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ + } + +/* Description : Add block 4x4 + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and stored. +*/ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \ + { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ + } + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB(in0, in1) \ + ({ \ + v16u8 out_m; \ + \ + out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \ + out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \ + out_m; \ + }) + +/* Description : Converts inputs to unsigned bytes, interleave, average & store + as 8x4 unsigned byte block + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride +*/ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ + pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ + AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ + } + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB(in0, in1, pdst) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \ + ST_SB(tmp_m, (pdst)); \ + } + +/* Description : Horizontal 2 tap filter kernel code + Arguments : Inputs - in0, in1, mask, coeff, shift +*/ +#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \ + ({ \ + v16i8 tmp0_m; \ + v8u16 tmp1_m; \ + \ + tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \ + tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \ + tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \ + \ + tmp1_m; \ + }) +#endif /* AOM_DSP_MIPS_MACROS_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c new file mode 100644 index 000000000..258eb5c07 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/sad_msa.c @@ -0,0 +1,1529 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ + { \ + out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \ + out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \ + } +#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__) + +static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + diff = __msa_asub_u_b(src, ref); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB2(ref, 16, ref0, ref1); + ref += ref_stride; + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t sad = 0; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = HADD_UH_U32(sad0); + sad += HADD_UH_U32(sad1); + + return sad; +} + +static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + INSERT_W4_UB(src0, src1, src2, src3, src); + + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); + ref += (4 * ref_stride); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src, ref, ref0, ref1, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); +} + +static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v4u32 sad; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); + ref0_4 = LD_UB(ref + 64); + ref += ref_stride; + + sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_SW_S32((v4i32)sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_SW_S32((v4i32)sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_SW_S32((v4i32)sad); +} + +static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, diff; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); + SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); + SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); + ref += (4 * ref_stride); + PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, + ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); + SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); + PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); + sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src, ref0, ref1, ref; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); + diff = __msa_asub_u_b(src, ref); + sad4 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); + diff = __msa_asub_u_b(src, ref); + sad5 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); + diff = __msa_asub_u_b(src, ref); + sad6 += __msa_hadd_u_h(diff, diff); + + ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); + diff = __msa_asub_u_b(src, ref); + sad7 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + int32_t ht_cnt; + v16u8 src0, src1; + v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + v8u16 sad4 = { 0 }; + v8u16 sad5 = { 0 }; + v8u16 sad6 = { 0 }; + v8u16 sad7 = { 0 }; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); + ref += ref_stride; + + sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); + sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); + sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); + sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); + sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); + sad_array[4] = HADD_UH_U32(sad4); + sad_array[5] = HADD_UH_U32(sad5); + sad_array[6] = HADD_UH_U32(sad6); + sad_array[7] = HADD_UH_U32(sad7); +} + +static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, uint32_t *sad_array) { + const uint8_t *src_dup, *ref_dup; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + v4u32 sad; + + src_dup = src; + ref_dup = ref; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); + ref += ref_stride; + + sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); + sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[3] = HADD_SW_S32(sad); + + sad0_0 = (v8u16)__msa_ldi_h(0); + sad0_1 = (v8u16)__msa_ldi_h(0); + sad1_0 = (v8u16)__msa_ldi_h(0); + sad1_1 = (v8u16)__msa_ldi_h(0); + sad2_0 = (v8u16)__msa_ldi_h(0); + sad2_1 = (v8u16)__msa_ldi_h(0); + sad3_0 = (v8u16)__msa_ldi_h(0); + sad3_1 = (v8u16)__msa_ldi_h(0); + + for (ht_cnt = 64; ht_cnt--;) { + LD_UB4(src_dup, 16, src0, src1, src2, src3); + src_dup += src_stride; + LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); + ref_dup += ref_stride; + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); + SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[4] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[5] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[6] = HADD_SW_S32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[7] = HADD_SW_S32(sad); +} + +static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + INSERT_W4_UB(src0, src1, src2, src3, src); + src_ptr += (4 * src_stride); + + LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref0_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad0 += __msa_hadd_u_h(diff, diff); + + LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref1_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad1 += __msa_hadd_u_h(diff, diff); + + LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref2_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad2 += __msa_hadd_u_h(diff, diff); + + LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ref3_ptr += (4 * ref_stride); + + diff = __msa_asub_u_b(src, ref); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref0_ptr += (4 * ref_stride); + LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7); + ref1_ptr += (4 * ref_stride); + LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11); + ref2_ptr += (4 * ref_stride); + LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15); + ref3_ptr += (4 * ref_stride); + + PCKEV_D2_UB(src1, src0, src3, src2, src0, src1); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1); + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1); + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1); + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + int32_t ht_cnt; + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + v16u8 src, ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = (height >> 1); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref0 = LD_UB(ref0_ptr); + ref0_ptr += ref_stride; + ref1 = LD_UB(ref1_ptr); + ref1_ptr += ref_stride; + ref2 = LD_UB(ref2_ptr); + ref2_ptr += ref_stride; + ref3 = LD_UB(ref3_ptr); + ref3_ptr += ref_stride; + + diff = __msa_asub_u_b(src, ref0); + sad0 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref1); + sad1 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref2); + sad2 += __msa_hadd_u_h(diff, diff); + diff = __msa_asub_u_b(src, ref3); + sad3 += __msa_hadd_u_h(diff, diff); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB2(src, 16, src0, src1); + src += src_stride; + + LD_UB2(ref0_ptr, 16, ref0, ref1); + ref0_ptr += ref_stride; + sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref1_ptr, 16, ref0, ref1); + ref1_ptr += ref_stride; + sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref2_ptr, 16, ref0, ref1); + ref2_ptr += ref_stride; + sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(ref3_ptr, 16, ref0, ref1); + ref3_ptr += ref_stride; + sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); + } + + sad_array[0] = HADD_UH_U32(sad0); + sad_array[1] = HADD_UH_U32(sad1); + sad_array[2] = HADD_UH_U32(sad2); + sad_array[3] = HADD_UH_U32(sad3); +} + +static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *const aref_ptr[], + int32_t ref_stride, int32_t height, + uint32_t *sad_array) { + const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr; + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8u16 sad0_0 = { 0 }; + v8u16 sad0_1 = { 0 }; + v8u16 sad1_0 = { 0 }; + v8u16 sad1_1 = { 0 }; + v8u16 sad2_0 = { 0 }; + v8u16 sad2_1 = { 0 }; + v8u16 sad3_0 = { 0 }; + v8u16 sad3_1 = { 0 }; + + ref0_ptr = aref_ptr[0]; + ref1_ptr = aref_ptr[1]; + ref2_ptr = aref_ptr[2]; + ref3_ptr = aref_ptr[3]; + + for (ht_cnt = height; ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + + LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3); + ref0_ptr += ref_stride; + sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3); + ref1_ptr += ref_stride; + sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3); + ref2_ptr += ref_stride; + sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + + LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3); + ref3_ptr += ref_stride; + sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); + sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); + } + + sad_array[0] = HADD_UH_U32(sad0_0); + sad_array[0] += HADD_UH_U32(sad0_1); + sad_array[1] = HADD_UH_U32(sad1_0); + sad_array[1] += HADD_UH_U32(sad1_1); + sad_array[2] = HADD_UH_U32(sad2_0); + sad_array[2] += HADD_UH_U32(sad2_1); + sad_array[3] = HADD_UH_U32(sad3_0); + sad_array[3] += HADD_UH_U32(sad3_1); +} + +static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff, pred, comp; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + comp = __msa_aver_u_b(pred, ref); + diff = __msa_asub_u_b(src, comp); + sad += __msa_hadd_u_h(diff, diff); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 diff0, diff1, pred0, pred1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1); + sad += SAD_UB2_UH(src0, src1, diff0, diff1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3, comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 3); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + + LD_UB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3); + ref += (4 * ref_stride); + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += (4 * 16); + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 comp0, comp1; + v8u16 sad = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 16, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + + LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6); + LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7); + ref += (4 * ref_stride); + + LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6); + LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7); + sec_pred += (4 * 32); + + AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1); + sad += SAD_UB2_UH(src0, src1, comp0, comp1); + AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1); + sad += SAD_UB2_UH(src2, src3, comp0, comp1); + AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1); + sad += SAD_UB2_UH(src4, src5, comp0, comp1); + AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1); + sad += SAD_UB2_UH(src6, src7, comp0, comp1); + } + + return HADD_UH_U32(sad); +} + +static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + int32_t height, const uint8_t *sec_pred) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 comp0, comp1, comp2, comp3; + v16u8 pred0, pred1, pred2, pred3; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v4u32 sad; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + + LD_UB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref += ref_stride; + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0, + comp1, comp2, comp3); + sad0 += SAD_UB2_UH(src0, src1, comp0, comp1); + sad1 += SAD_UB2_UH(src2, src3, comp2, comp3); + } + + sad = __msa_hadd_u_w(sad0, sad0); + sad += __msa_hadd_u_w(sad1, sad1); + + return HADD_SW_S32(sad); +} + +#define AOM_SAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_4width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_8width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_16width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_32width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride) { \ + return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ + } + +#define AOM_SAD_4xHEIGHTx3_MSA(height) \ + void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx3_MSA(height) \ + void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx3_MSA(height) \ + void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx3_MSA(height) \ + void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx3_MSA(height) \ + void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_4xHEIGHTx8_MSA(height) \ + void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx8_MSA(height) \ + void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx8_MSA(height) \ + void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx8_MSA(height) \ + void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx8_MSA(height) \ + void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sads) { \ + sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ + } + +#define AOM_SAD_4xHEIGHTx4D_MSA(height) \ + void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_8xHEIGHTx4D_MSA(height) \ + void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_16xHEIGHTx4D_MSA(height) \ + void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_32xHEIGHTx4D_MSA(height) \ + void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_SAD_64xHEIGHTx4D_MSA(height) \ + void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *const refs[], \ + int32_t ref_stride, uint32_t *sads) { \ + sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ + } + +#define AOM_AVGSAD_4xHEIGHT_MSA(height) \ + uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_8xHEIGHT_MSA(height) \ + uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \ + const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *second_pred) { \ + return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_16xHEIGHT_MSA(height) \ + uint32_t aom_sad16x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_32xHEIGHT_MSA(height) \ + uint32_t aom_sad32x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +#define AOM_AVGSAD_64xHEIGHT_MSA(height) \ + uint32_t aom_sad64x##height##_avg_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, const uint8_t *second_pred) { \ + return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \ + second_pred); \ + } + +/* clang-format off */ +// 64x64 +AOM_SAD_64xHEIGHT_MSA(64) +AOM_SAD_64xHEIGHTx3_MSA(64) +AOM_SAD_64xHEIGHTx8_MSA(64) +AOM_SAD_64xHEIGHTx4D_MSA(64) +AOM_AVGSAD_64xHEIGHT_MSA(64) + +// 64x32 +AOM_SAD_64xHEIGHT_MSA(32) +AOM_SAD_64xHEIGHTx3_MSA(32) +AOM_SAD_64xHEIGHTx8_MSA(32) +AOM_SAD_64xHEIGHTx4D_MSA(32) +AOM_AVGSAD_64xHEIGHT_MSA(32) + +// 32x64 +AOM_SAD_32xHEIGHT_MSA(64) +AOM_SAD_32xHEIGHTx3_MSA(64) +AOM_SAD_32xHEIGHTx8_MSA(64) +AOM_SAD_32xHEIGHTx4D_MSA(64) +AOM_AVGSAD_32xHEIGHT_MSA(64) + +// 32x32 +AOM_SAD_32xHEIGHT_MSA(32) +AOM_SAD_32xHEIGHTx3_MSA(32) +AOM_SAD_32xHEIGHTx8_MSA(32) +AOM_SAD_32xHEIGHTx4D_MSA(32) +AOM_AVGSAD_32xHEIGHT_MSA(32) + +// 32x16 +AOM_SAD_32xHEIGHT_MSA(16) +AOM_SAD_32xHEIGHTx3_MSA(16) +AOM_SAD_32xHEIGHTx8_MSA(16) +AOM_SAD_32xHEIGHTx4D_MSA(16) +AOM_AVGSAD_32xHEIGHT_MSA(16) + +// 16x32 +AOM_SAD_16xHEIGHT_MSA(32) +AOM_SAD_16xHEIGHTx3_MSA(32) +AOM_SAD_16xHEIGHTx8_MSA(32) +AOM_SAD_16xHEIGHTx4D_MSA(32) +AOM_AVGSAD_16xHEIGHT_MSA(32) + +// 16x16 +AOM_SAD_16xHEIGHT_MSA(16) +AOM_SAD_16xHEIGHTx3_MSA(16) +AOM_SAD_16xHEIGHTx8_MSA(16) +AOM_SAD_16xHEIGHTx4D_MSA(16) +AOM_AVGSAD_16xHEIGHT_MSA(16) + +// 16x8 +AOM_SAD_16xHEIGHT_MSA(8) +AOM_SAD_16xHEIGHTx3_MSA(8) +AOM_SAD_16xHEIGHTx8_MSA(8) +AOM_SAD_16xHEIGHTx4D_MSA(8) +AOM_AVGSAD_16xHEIGHT_MSA(8) + +// 8x16 +AOM_SAD_8xHEIGHT_MSA(16) +AOM_SAD_8xHEIGHTx3_MSA(16) +AOM_SAD_8xHEIGHTx8_MSA(16) +AOM_SAD_8xHEIGHTx4D_MSA(16) +AOM_AVGSAD_8xHEIGHT_MSA(16) + +// 8x8 +AOM_SAD_8xHEIGHT_MSA(8) +AOM_SAD_8xHEIGHTx3_MSA(8) +AOM_SAD_8xHEIGHTx8_MSA(8) +AOM_SAD_8xHEIGHTx4D_MSA(8) +AOM_AVGSAD_8xHEIGHT_MSA(8) + +// 8x4 +AOM_SAD_8xHEIGHT_MSA(4) +AOM_SAD_8xHEIGHTx3_MSA(4) +AOM_SAD_8xHEIGHTx8_MSA(4) +AOM_SAD_8xHEIGHTx4D_MSA(4) +AOM_AVGSAD_8xHEIGHT_MSA(4) + +// 4x8 +AOM_SAD_4xHEIGHT_MSA(8) +AOM_SAD_4xHEIGHTx3_MSA(8) +AOM_SAD_4xHEIGHTx8_MSA(8) +AOM_SAD_4xHEIGHTx4D_MSA(8) +AOM_AVGSAD_4xHEIGHT_MSA(8) + +// 4x4 +AOM_SAD_4xHEIGHT_MSA(4) +AOM_SAD_4xHEIGHTx3_MSA(4) +AOM_SAD_4xHEIGHTx8_MSA(4) +AOM_SAD_4xHEIGHTx4D_MSA(4) +AOM_AVGSAD_4xHEIGHT_MSA(4) + /* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c new file mode 100644 index 000000000..3eb85107d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c @@ -0,0 +1,1795 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" +#include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/variance.h" + +static const uint8_t bilinear_filters_msa[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 pred, src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t height, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref, pred; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + + pred = LD_UB(sec_pred); + sec_pred += 16; + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + src = __msa_aver_u_b(src, pred); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1, pred0, pred1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + const uint8_t *ref_ptr, + int32_t ref_stride, + const uint8_t *sec_pred, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + + LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3); + sec_pred += 64; + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + CALC_MSE_AVG_B(src0, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 filt0, out, ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1, + src2, src3); + CALC_MSE_AVG_B(src0, dst0, var, avg); + CALC_MSE_AVG_B(src1, dst1, var, avg); + CALC_MSE_AVG_B(src2, dst2, var, avg); + CALC_MSE_AVG_B(src3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4, out; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 ref = { 0 }; + v16u8 src2110, src4332; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride, + filter, height, &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out, ref = { 0 }; + v16u8 filt_vt, filt_hz, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 filt_vt, filt_hz, vec0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3; + v8u16 tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + CALC_MSE_AVG_B(src2, ref2, var, avg); + CALC_MSE_AVG_B(src3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride, + filter_horiz, filter_vert, height, + &diff0[loop_cnt]); + src += 16; + dst += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 out, pred, filt0, ref = { 0 }; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + ILVEV_W2_SB(src0, src1, src2, src3, src0, src2); + out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 out, pred, filt0; + v16u8 ref0, ref1, ref2, ref3; + v16i8 src0, src1, src2, src3; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 vec0, vec1, vec2, vec3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src1, src2, src3); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, + vec2, vec3); + SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); + PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1, + src2, src3); + out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0); + + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref0, var, avg); + out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2); + pred = LD_UB(sec_pred); + sec_pred += 16; + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v16u8 dst0, dst1, dst2, dst3; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 pred0, pred1, pred2, pred3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 out0, out1, out2, out3, out4, out5, out6, out7; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_SB4(src, src_stride, src0, src2, src4, src6); + LD_SB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + dst += (4 * dst_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); + VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); + VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5); + VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, + out2, out3); + DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, + out6, out7); + SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); + SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); + PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1, + tmp2, tmp3); + AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1, + tmp2, tmp3); + + CALC_MSE_AVG_B(tmp0, dst0, var, avg); + CALC_MSE_AVG_B(tmp1, dst1, var, avg); + CALC_MSE_AVG_B(tmp2, dst2, var, avg); + CALC_MSE_AVG_B(tmp3, dst3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_h_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 src10_r, src32_r, src21_r, src43_r; + v16u8 out, pred, ref = { 0 }; + v16u8 src2110, src4332, filt0; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + v8u16 tmp0, tmp1; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, + src32_r, src43_r); + ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); + DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, filt0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, + vec3); + DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, + tmp2, tmp3); + SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1); + AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 src0, src1, src2, src3, src4; + v16u8 out0, out1, out2, out3, filt0; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter); + filt0 = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2); + ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3); + DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6); + ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7); + DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + + src0 = src4; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, + sec_pred, filter, height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_v_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter, + int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += + subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred, + filter, height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 }; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 out, pred, ref = { 0 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + pred = LD_UB(sec_pred); + sec_pred += 16; + LW4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); + hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); + hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, pred); + CALC_MSE_AVG_B(out, ref, var, avg); + src0 = src4; + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 ref0, ref1, ref2, ref3; + v16u8 src0, src1, src2, src3, src4; + v16u8 pred0, pred1, out0, out1; + v16u8 filt_hz, filt_vt, vec0; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + src0 = LD_UB(src); + src += src_stride; + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src1, src2, src3, src4); + src += (4 * src_stride); + LD_UB2(sec_pred, 16, pred0, pred1); + sec_pred += 32; + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1); + hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp0 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp1 = __msa_dotp_u_h(vec0, filt_vt); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); + tmp2 = __msa_dotp_u_h(vec0, filt_vt); + hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + + vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); + tmp3 = __msa_dotp_u_h(vec0, filt_vt); + + SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1); + AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t subpel_avg_ssediff_16w_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) { + int16_t filtval; + uint32_t loop_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 ref0, ref1, ref2, ref3; + v16u8 pred0, pred1, pred2, pred3; + v16u8 out0, out1, out2, out3; + v16u8 filt_hz, filt_vt, vec0, vec1; + v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + filtval = LH(filter_horiz); + filt_hz = (v16u8)__msa_fill_h(filtval); + filtval = LH(filter_vert); + filt_vt = (v16u8)__msa_fill_h(filtval); + + LD_UB2(src, 8, src0, src1); + src += src_stride; + + hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + + for (loop_cnt = (height >> 2); loop_cnt--;) { + LD_UB4(src, src_stride, src0, src2, src4, src6); + LD_UB4(src + 8, src_stride, src1, src3, src5, src7); + src += (4 * src_stride); + LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3); + sec_pred += (4 * width); + + hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); + hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); + hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); + ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); + DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); + SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); + out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + + LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3); + dst += (4 * dst_stride); + + AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1, + out2, out3); + + CALC_MSE_AVG_B(out0, ref0, var, avg); + CALC_MSE_AVG_B(out1, ref1, var, avg); + CALC_MSE_AVG_B(out2, ref2, var, avg); + CALC_MSE_AVG_B(out3, ref3, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, diff, 16); +} + +static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[2]; + + for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 32); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1]; + + return sse; +} + +static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( + const uint8_t *src, int32_t src_stride, const uint8_t *dst, + int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz, + const uint8_t *filter_vert, int32_t height, int32_t *diff) { + uint32_t loop_cnt, sse = 0; + int32_t diff0[4]; + + for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) { + sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, + sec_pred, filter_horiz, filter_vert, + height, &diff0[loop_cnt], 64); + src += 16; + dst += 16; + sec_pred += 16; + } + + *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3]; + + return sse; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + uint32_t *sse) { \ + int32_t diff; \ + uint32_t var; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ + src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_sse_diff_##wd##width_v_msa( \ + src, src_stride, ref, ref_stride, v_filter, ht, &diff); \ + } \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ + src, src_stride, ref, ref_stride, h_filter, ht, &diff); \ + \ + var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } else { \ + var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \ + sse); \ + } \ + } \ + \ + return var; \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64) + +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32) +AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) +/* clang-format on */ + +#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, ht, &diff); \ + } \ + } \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32) + +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16) +AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, + int32_t src_stride, + int32_t xoffset, int32_t yoffset, + const uint8_t *ref_ptr, + int32_t ref_stride, uint32_t *sse, + const uint8_t *sec_pred) { + int32_t diff; + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + + if (yoffset) { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_hv_msa( + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, + v_filter, 64, &diff); + } else { + *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + v_filter, 64, &diff); + } + } else { + if (xoffset) { + *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, + ref_stride, sec_pred, + h_filter, 64, &diff); + } else { + *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride, + sec_pred, &diff); + } + } + + return VARIANCE_32Wx64H(*sse, diff); +} + +#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ + uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \ + const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ + int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + uint32_t *sse, const uint8_t *sec_pred) { \ + int32_t diff; \ + const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + \ + if (yoffset) { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ + v_filter, ht, &diff); \ + } else { \ + *sse = sub_pixel_avg_sse_diff_64width_v_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \ + &diff); \ + } \ + } else { \ + if (xoffset) { \ + *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ + src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ + &diff); \ + } else { \ + *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \ + ref_stride, sec_pred, &diff); \ + } \ + } \ + \ + return VARIANCE_64Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32) +AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64) +/* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c new file mode 100644 index 000000000..37b89765d --- /dev/null +++ b/third_party/aom/aom_dsp/mips/subtract_msa.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t src0, src1, src2, src3; + uint32_t pred0, pred1, pred2, pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); +} + +static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *pred_ptr, int32_t pred_stride, + int16_t *diff_ptr, int32_t diff_stride) { + uint32_t loop_cnt; + uint64_t src0, src1, pred0, pred1; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 4; loop_cnt--;) { + LD2(src_ptr, src_stride, src0, src1); + src_ptr += (2 * src_stride); + LD2(pred_ptr, pred_stride, pred0, pred1); + pred_ptr += (2 * pred_stride); + + INSERT_D2_SB(src0, src1, src); + INSERT_D2_SB(pred0, pred1, pred); + ILVRL_B2_UB(src, pred, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff_ptr, diff_stride); + diff_ptr += (2 * diff_stride); + } +} + +static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + int8_t count; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (count = 2; count--;) { + LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += (8 * src_stride); + + LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6, + pred7); + pred += (8 * pred_stride); + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + diff += diff_stride; + } +} + +static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 8; loop_cnt--;) { + LD_SB2(src, 16, src0, src1); + src += src_stride; + LD_SB2(src, 16, src2, src3); + src += src_stride; + LD_SB2(src, 16, src4, src5); + src += src_stride; + LD_SB2(src, 16, src6, src7); + src += src_stride; + + LD_SB2(pred, 16, pred0, pred1); + pred += pred_stride; + LD_SB2(pred, 16, pred2, pred3); + pred += pred_stride; + LD_SB2(pred, 16, pred4, pred5); + pred += pred_stride; + LD_SB2(pred, 16, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + diff += diff_stride; + } +} + +static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *pred, int32_t pred_stride, + int16_t *diff, int32_t diff_stride) { + uint32_t loop_cnt; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + for (loop_cnt = 32; loop_cnt--;) { + LD_SB4(src, 16, src0, src1, src2, src3); + src += src_stride; + LD_SB4(src, 16, src4, src5, src6, src7); + src += src_stride; + + LD_SB4(pred, 16, pred0, pred1, pred2, pred3); + pred += pred_stride; + LD_SB4(pred, 16, pred4, pred5, pred6, pred7); + pred += pred_stride; + + ILVRL_B2_UB(src0, pred0, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src1, pred1, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src2, pred2, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src3, pred3, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + + ILVRL_B2_UB(src4, pred4, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff, 8); + ILVRL_B2_UB(src5, pred5, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 16, 8); + ILVRL_B2_UB(src6, pred6, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 32, 8); + ILVRL_B2_UB(src7, pred7, src_l0, src_l1); + HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); + ST_SH2(diff0, diff1, diff + 48, 8); + diff += diff_stride; + } +} + +void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + if (rows == cols) { + switch (rows) { + case 4: + sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 8: + sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 16: + sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 32: + sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + case 64: + sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr, + diff_stride); + break; + default: + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } + } else { + aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); + } +} diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h new file mode 100644 index 000000000..cba5d4445 --- /dev/null +++ b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ + +#include "aom_dsp/mips/macros_msa.h" + +#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ + { \ + v8i16 k0_m = __msa_fill_h(cnst0); \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + s0_m = (v4i32)__msa_fill_h(cnst1); \ + k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + \ + ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ + DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + \ + DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ + out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ + } + +#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ + dst1, dst2, dst3) \ + { \ + v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ + v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ + \ + DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ + tp4_m); \ + DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ + tp8_m); \ + BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ + BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ + SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ + SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ + PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ + dst1, dst2, dst3); \ + } + +#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ + ({ \ + v8i16 dst_m; \ + v4i32 tp0_m, tp1_m; \ + \ + DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ + SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ + dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ + \ + dst_m; \ + }) + +#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ + { \ + v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ + v8i16 madd_s0_m, madd_s1_m; \ + \ + ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ + madd0_m, madd1_m, madd2_m, madd3_m); \ + SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ + } + +#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ + out2, out3) \ + { \ + v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ + v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ + \ + ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ + ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ + cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ + DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ + cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ + BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ + SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ + PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ + } +#endif // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c new file mode 100644 index 000000000..745fdfc9c --- /dev/null +++ b/third_party/aom/aom_dsp/mips/variance_msa.c @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/mips/macros_msa.h" + +#define CALC_MSE_B(src, ref, var) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + } + +#define CALC_MSE_AVG_B(src, ref, var, sub) \ + { \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \ + HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ + DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ + \ + sub += res_l0_m + res_l1_m; \ + } + +#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) + +#define VARIANCE_LARGE_WxH(sse, diff, shift) \ + sse - (((int64_t)diff * diff) >> shift) + +static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + int32_t ht_cnt; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src, ref; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src, ref, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height, int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg); + CALC_MSE_AVG_B(src1, ref1, var, avg); + } + + vec = __msa_hadd_s_w(avg, avg); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 16; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src2, ref2, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src3, ref3, var, avg1); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t *diff) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg0 = { 0 }; + v8i16 avg1 = { 0 }; + v8i16 avg2 = { 0 }; + v8i16 avg3 = { 0 }; + v4i32 vec, var = { 0 }; + + for (ht_cnt = 32; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_AVG_B(src0, ref0, var, avg0); + CALC_MSE_AVG_B(src1, ref1, var, avg1); + CALC_MSE_AVG_B(src2, ref2, var, avg2); + CALC_MSE_AVG_B(src3, ref3, var, avg3); + } + + vec = __msa_hadd_s_w(avg0, avg0); + vec += __msa_hadd_s_w(avg1, avg1); + vec += __msa_hadd_s_w(avg2, avg2); + vec += __msa_hadd_s_w(avg3, avg3); + *diff = HADD_SW_S32(vec); + + return HADD_SW_S32(var); +} + +static uint32_t get_mb_ss_msa(const int16_t *src) { + uint32_t sum, cnt; + v8i16 src0, src1, src2, src3; + v4i32 src0_l, src1_l, src2_l, src3_l; + v4i32 src0_r, src1_r, src2_r, src3_r; + v2i64 sq_src_l = { 0 }; + v2i64 sq_src_r = { 0 }; + + for (cnt = 8; cnt--;) { + LD_SH4(src, 8, src0, src1, src2, src3); + src += 4 * 8; + + UNPCK_SH_SW(src0, src0_l, src0_r); + UNPCK_SH_SW(src1, src1_l, src1_r); + UNPCK_SH_SW(src2, src2_l, src2_r); + UNPCK_SH_SW(src3, src3_l, src3_r); + + DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r); + DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r); + } + + sq_src_l += __msa_splati_d(sq_src_l, 1); + sq_src_r += __msa_splati_d(sq_src_r, 1); + + sum = __msa_copy_s_d(sq_src_l, 0); + sum += __msa_copy_s_d(sq_src_r, 0); + + return sum; +} + +static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LW4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + INSERT_W4_UB(src0, src1, src2, src3, src); + INSERT_W4_UB(ref0, ref1, ref2, ref3, ref); + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB4(src_ptr, src_stride, src0, src1, src2, src3); + src_ptr += (4 * src_stride); + LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + ref_ptr += (4 * ref_stride); + + PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1, + ref0, ref1); + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src, ref; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + + src = LD_UB(src_ptr); + src_ptr += src_stride; + ref = LD_UB(ref_ptr); + ref_ptr += ref_stride; + CALC_MSE_B(src, ref, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, ref0, ref1; + v4i32 var = { 0 }; + + for (ht_cnt = (height >> 2); ht_cnt--;) { + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + + LD_UB2(src_ptr, 16, src0, src1); + src_ptr += src_stride; + LD_UB2(ref_ptr, 16, ref0, ref1); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src1, ref1, var); + } + + return HADD_SW_S32(var); +} + +static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride, + int32_t height) { + int32_t ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for (ht_cnt = height >> 1; ht_cnt--;) { + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + + LD_UB4(src_ptr, 16, src0, src1, src2, src3); + src_ptr += src_stride; + LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3); + ref_ptr += ref_stride; + CALC_MSE_B(src0, ref0, var); + CALC_MSE_B(src2, ref2, var); + CALC_MSE_B(src1, ref1, var); + CALC_MSE_B(src3, ref3, var); + } + + return HADD_SW_S32(var); +} + +uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride, + const uint8_t *ref_ptr, int32_t ref_stride) { + uint32_t err = 0; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 src_vec0, src_vec1; + v8i16 diff0, diff1; + v4i32 err0 = { 0 }; + v4i32 err1 = { 0 }; + + LW4(src_ptr, src_stride, src0, src1, src2, src3); + LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); + INSERT_W4_SB(src0, src1, src2, src3, src); + INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); + ILVRL_B2_UB(src, ref, src_vec0, src_vec1); + HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); + DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); + err = HADD_SW_S32(err0); + err += HADD_SW_S32(err1); + + return err; +} + +#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); +#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5); +#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6); +#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7); +#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8); + +#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9); +#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10); +#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11); +#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12); + +#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \ + uint32_t aom_variance##wd##x##ht##_msa( \ + const uint8_t *src, int32_t src_stride, const uint8_t *ref, \ + int32_t ref_stride, uint32_t *sse) { \ + int32_t diff; \ + \ + *sse = \ + sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \ + \ + return VARIANCE_##wd##Wx##ht##H(*sse, diff); \ + } + +/* clang-format off */ +AOM_VARIANCE_WDXHT_MSA(4, 4) +AOM_VARIANCE_WDXHT_MSA(4, 8) + +AOM_VARIANCE_WDXHT_MSA(8, 4) +AOM_VARIANCE_WDXHT_MSA(8, 8) +AOM_VARIANCE_WDXHT_MSA(8, 16) + +AOM_VARIANCE_WDXHT_MSA(16, 8) +AOM_VARIANCE_WDXHT_MSA(16, 16) +AOM_VARIANCE_WDXHT_MSA(16, 32) + +AOM_VARIANCE_WDXHT_MSA(32, 16) +AOM_VARIANCE_WDXHT_MSA(32, 32) +/* clang-format on */ + +uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_32Wx64H(*sse, diff); +} + +uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx32H(*sse, diff); +} + +uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + int32_t diff; + + *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff); + + return VARIANCE_64Wx64H(*sse, diff); +} + +uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8); + + return *sse; +} + +uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, + uint32_t *sse) { + *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16); + + return *sse; +} + +void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum); +} + +void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride, + const uint8_t *ref, int32_t ref_stride, uint32_t *sse, + int32_t *sum) { + *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum); +} + +uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); } diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h new file mode 100644 index 000000000..11a8c5ad7 --- /dev/null +++ b/third_party/aom/aom_dsp/postproc.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_POSTPROC_H_ +#define AOM_DSP_POSTPROC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// Fills a noise buffer with gaussian noise strength determined by sigma. +int aom_setup_noise(double sigma, int size, char *noise); + +#ifdef __cplusplus +} +#endif + +#endif // AOM_DSP_POSTPROC_H_ diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c new file mode 100644 index 000000000..c60bfdac5 --- /dev/null +++ b/third_party/aom/aom_dsp/prob.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" + +#if CONFIG_EC_MULTISYMBOL +#include +#endif + +#include "aom_dsp/prob.h" + +const uint8_t aom_norm[256] = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static unsigned int tree_merge_probs_impl(unsigned int i, + const aom_tree_index *tree, + const aom_prob *pre_probs, + const unsigned int *counts, + aom_prob *probs) { + const int l = tree[i]; + const unsigned int left_count = + (l <= 0) ? counts[-l] + : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); + const int r = tree[i + 1]; + const unsigned int right_count = + (r <= 0) ? counts[-r] + : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); + const unsigned int ct[2] = { left_count, right_count }; + probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); + return left_count + right_count; +} + +void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, + const unsigned int *counts, aom_prob *probs) { + tree_merge_probs_impl(0, tree, pre_probs, counts, probs); +} + +#if CONFIG_EC_MULTISYMBOL +typedef struct tree_node tree_node; + +struct tree_node { + aom_tree_index index; + uint8_t probs[16]; + uint8_t prob; + int path; + int len; + int l; + int r; + aom_cdf_prob pdf; +}; + +/* Compute the probability of this node in Q23 */ +static uint32_t tree_node_prob(tree_node n, int i) { + uint32_t prob; + /* 1.0 in Q23 */ + prob = 16777216; + for (; i < n.len; i++) { + prob = prob * n.probs[i] >> 8; + } + return prob; +} + +static int tree_node_cmp(tree_node a, tree_node b) { + int i; + uint32_t pa; + uint32_t pb; + for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) { + } + pa = tree_node_prob(a, i); + pb = tree_node_prob(b, i); + return pa > pb ? 1 : pa < pb ? -1 : 0; +} + +/* Given a Q15 probability for symbol subtree rooted at tree[n], this function + computes the probability of each symbol (defined as a node that has no + children). */ +static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n, + aom_cdf_prob pdf) { + if (tree[n].l == 0) { + /* This prevents probability computations in Q15 that underflow from + producing a symbol that has zero probability. */ + if (pdf == 0) pdf = 1; + tree[n].pdf = pdf; + return pdf; + } else { + /* We process the smaller probability first, */ + if (tree[n].prob < 128) { + aom_cdf_prob lp; + aom_cdf_prob rp; + lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8; + lp = tree_node_compute_probs(tree, tree[n].l, lp); + rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 0 : pdf - lp); + return lp + rp; + } else { + aom_cdf_prob rp; + aom_cdf_prob lp; + rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8; + rp = tree_node_compute_probs(tree, tree[n].r, rp); + lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp); + return lp + rp; + } + } +} + +static int tree_node_extract(tree_node *tree, int n, int symb, + aom_cdf_prob *pdf, aom_tree_index *index, + int *path, int *len) { + if (tree[n].l == 0) { + pdf[symb] = tree[n].pdf; + if (index != NULL) index[symb] = tree[n].index; + if (path != NULL) path[symb] = tree[n].path; + if (len != NULL) len[symb] = tree[n].len; + return symb + 1; + } else { + symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len); + return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len); + } +} + +int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, + aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index, + int *path, int *len) { + tree_node symb[2 * 16 - 1]; + int nodes; + int next[16]; + int size; + int nsymbs; + int i; + /* Create the root node with probability 1 in Q15. */ + symb[0].index = root; + symb[0].path = 0; + symb[0].len = 0; + symb[0].l = symb[0].r = 0; + nodes = 1; + next[0] = 0; + size = 1; + nsymbs = 1; + while (size > 0 && nsymbs < 16) { + int m; + tree_node n; + aom_tree_index j; + uint8_t prob; + m = 0; + /* Find the internal node with the largest probability. */ + for (i = 1; i < size; i++) { + if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i; + } + i = next[m]; + memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1))); + size--; + /* Split this symbol into two symbols */ + n = symb[i]; + j = n.index; + prob = probs[j >> 1]; + /* Left */ + n.index = tree[j]; + n.path <<= 1; + n.len++; + n.probs[n.len - 1] = prob; + symb[nodes] = n; + if (n.index > 0) { + next[size++] = nodes; + } + /* Right */ + n.index = tree[j + 1]; + n.path += 1; + n.probs[n.len - 1] = 256 - prob; + symb[nodes + 1] = n; + if (n.index > 0) { + next[size++] = nodes + 1; + } + symb[i].prob = prob; + symb[i].l = nodes; + symb[i].r = nodes + 1; + nodes += 2; + nsymbs++; + } + /* Compute the probabilities of each symbol in Q15 */ + tree_node_compute_probs(symb, 0, CDF_PROB_TOP); + /* Extract the cdf, index, path and length */ + tree_node_extract(symb, 0, 0, cdf, index, path, len); + /* Convert to CDF */ + cdf[0] = AOM_ICDF(cdf[0]); + for (i = 1; i < nsymbs; i++) { + cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i - 1]) + cdf[i]); + } +// Store symbol count at the end of the CDF +#if CONFIG_EC_ADAPT + cdf[nsymbs] = 0; +#endif + return nsymbs; +} + +/* This code assumes that tree contains as unique leaf nodes the integer values + 0 to len - 1 and produces the forward and inverse mapping tables in ind[] + and inv[] respectively. */ +static void tree_to_index(int *stack_index, int *ind, int *inv, + const aom_tree_index *tree, int value, int index) { + value *= 2; + + do { + const aom_tree_index content = tree[index]; + ++index; + if (content <= 0) { + inv[*stack_index] = -content; + ind[-content] = *stack_index; + ++(*stack_index); + } else { + tree_to_index(stack_index, ind, inv, tree, value, content); + } + } while (++value & 1); +} + +void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree) { + int stack_index = 0; + tree_to_index(&stack_index, ind, inv, tree, 0, 0); +} +#endif diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h new file mode 100644 index 000000000..808592923 --- /dev/null +++ b/third_party/aom/aom_dsp/prob.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_PROB_H_ +#define AOM_DSP_PROB_H_ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_common.h" + +#include "aom_ports/bitops.h" +#include "aom_ports/mem.h" + +#if CONFIG_DAALA_EC +#include "aom_dsp/entcode.h" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint8_t aom_prob; + +// TODO(negge): Rename this aom_prob once we remove vpxbool. +typedef uint16_t aom_cdf_prob; + +#if CONFIG_EC_MULTISYMBOL +#define CDF_SIZE(x) ((x) + 1) +#endif + +#define CDF_PROB_BITS 15 +#define CDF_PROB_TOP (1 << CDF_PROB_BITS) + +#if CONFIG_DAALA_EC +#define AOM_ICDF OD_ICDF +#else +#define AOM_ICDF(x) (x) +#endif + +#define MAX_PROB 255 + +#define aom_prob_half ((aom_prob)128) + +typedef int8_t aom_tree_index; + +#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count)) + +#define MODE_MV_COUNT_SAT 20 + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of aom_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const aom_tree_index aom_tree[]; + +static INLINE aom_prob get_prob(unsigned int num, unsigned int den) { + assert(den != 0); + { + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); + // (p > 255) ? 255 : (p < 1) ? 1 : p; + const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); + return (aom_prob)clipped_prob; + } +} + +static INLINE aom_prob get_binary_prob(unsigned int n0, unsigned int n1) { + const unsigned int den = n0 + n1; + if (den == 0) return 128u; + return get_prob(n0, den); +} + +/* This function assumes prob1 and prob2 are already within [1,255] range. */ +static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) { + return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); +} + +static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const aom_prob prob = get_binary_prob(ct[0], ct[1]); + const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; +static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { + 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, + 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 +}; + +static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob, + const unsigned int ct[2]) { + const unsigned int den = ct[0] + ct[1]; + if (den == 0) { + return pre_prob; + } else { + const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT); + const unsigned int factor = count_to_update_factor[count]; + const aom_prob prob = get_prob(ct[0], den); + return weighted_prob(pre_prob, prob, factor); + } +} + +void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, + const unsigned int *counts, aom_prob *probs); + +#if CONFIG_EC_MULTISYMBOL +int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, + aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind, + int *pth, int *len); + +static INLINE void av1_tree_to_cdf(const aom_tree_index *tree, + const aom_prob *probs, aom_cdf_prob *cdf) { + aom_tree_index index[16]; + int path[16]; + int dist[16]; + tree_to_cdf(tree, probs, 0, cdf, index, path, dist); +} + +#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \ + do { \ + int i; \ + for (i = 0; i < u; i++) { \ + av1_tree_to_cdf(tree, probs[i], cdf[i]); \ + } \ + } while (0) + +#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u) \ + do { \ + int j; \ + int i; \ + for (j = 0; j < v; j++) { \ + for (i = 0; i < u; i++) { \ + av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \ + } \ + } \ + } while (0) + +void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree); +#endif + +DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]); + +#if CONFIG_EC_ADAPT +static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { + const int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); + const int rate2 = 5; + int i, tmp; + int diff; +#if 1 + const int tmp0 = 1 << rate2; + tmp = AOM_ICDF(tmp0); + diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate; +// Single loop (faster) +#if CONFIG_DAALA_EC && CONFIG_EC_SMALLMUL + for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) { + tmp -= (i == val ? diff : 0); + cdf[i] += ((tmp - cdf[i]) >> rate); + } +#else + for (i = 0; i < nsymbs - 1; ++i, tmp += tmp0) { + tmp += (i == val ? diff : 0); + cdf[i] -= ((cdf[i] - tmp) >> rate); + } +#endif +#else + for (i = 0; i < nsymbs; ++i) { + tmp = (i + 1) << rate2; + cdf[i] -= ((cdf[i] - tmp) >> rate); + } + diff = CDF_PROB_TOP - cdf[nsymbs - 1]; + + for (i = val; i < nsymbs; ++i) { + cdf[i] += diff; + } +#endif + cdf[nsymbs] += (cdf[nsymbs] < 32); +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_PROB_H_ diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c new file mode 100644 index 000000000..461c13729 --- /dev/null +++ b/third_party/aom/aom_dsp/psnr.c @@ -0,0 +1,373 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/psnr.h" +#include "aom_scale/yv12config.h" + +double aom_sse_to_psnr(double samples, double peak, double sse) { + if (sse > 0.0) { + const double psnr = 10.0 * log10(samples * peak * peak / sse); + return psnr > MAX_PSNR ? MAX_PSNR : psnr; + } else { + return MAX_PSNR; + } +} + +/* TODO(yaowu): The block_variance calls the unoptimized versions of variance() +* and highbd_8_variance(). It should not. +*/ +static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, unsigned int *sse, + int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +#if CONFIG_HIGHBITDEPTH +static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, uint64_t *sse, int64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h, unsigned int *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, + &sum_long); + *sse = (unsigned int)sse_long; + *sum = (int)sum_long; +} +#endif // CONFIG_HIGHBITDEPTH + +static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + const int dw = width % 16; + const int dh = height % 16; + int64_t total_sse = 0; + unsigned int sse = 0; + int sum = 0; + int x, y; + + if (dw > 0) { + encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, + height, &sse, &sum); + total_sse += sse; + } + + if (dh > 0) { + encoder_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh, + &sse, &sum); + total_sse += sse; + } + + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + + pa += 16; + pb += 16; + } + + a += 16 * a_stride; + b += 16 * b_stride; + } + + return total_sse; +} + +#if CONFIG_HIGHBITDEPTH +static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height, unsigned int input_shift) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t total_sse = 0; + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + int64_t diff; + diff = (a[x] >> input_shift) - (b[x] >> input_shift); + total_sse += diff * diff; + } + a += a_stride; + b += b_stride; + } + return total_sse; +} + +static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int64_t total_sse = 0; + int x, y; + const int dw = width % 16; + const int dh = height % 16; + unsigned int sse = 0; + int sum = 0; + if (dw > 0) { + encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height, &sse, &sum); + total_sse += sse; + } + if (dh > 0) { + encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh, &sse, &sum); + total_sse += sse; + } + for (y = 0; y < height / 16; ++y) { + const uint8_t *pa = a; + const uint8_t *pb = b; + for (x = 0; x < width / 16; ++x) { + aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); + total_sse += sse; + pa += 16; + pb += 16; + } + a += 16 * a_stride; + b += 16 * b_stride; + } + return total_sse; +} +#endif // CONFIG_HIGHBITDEPTH + +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, + width, height); +} + +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + + return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height) { + return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride, + width, height); +} + +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + + return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +#if CONFIG_HIGHBITDEPTH +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse( + a->y_buffer + vstart * a->y_stride + hstart, a->y_stride, + b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height); +} + +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->y_crop_width == b->y_crop_width); + assert(a->y_crop_height == b->y_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride, + a->y_crop_width, a->y_crop_height); +} + +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->u_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} + +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height) { + return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart, + a->uv_stride, + b->v_buffer + vstart * b->uv_stride + hstart, + b->uv_stride, width, height); +} + +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b) { + assert(a->uv_crop_width == b->uv_crop_width); + assert(a->uv_crop_height == b->uv_crop_height); + assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0); + + return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, + a->uv_crop_width, a->uv_crop_height); +} +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_HIGHBITDEPTH +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + uint32_t bit_depth, uint32_t in_bit_depth) { + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + const double peak = (double)((1 << in_bit_depth) - 1); + const unsigned int input_shift = bit_depth - in_bit_depth; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + uint64_t sse; + if (a->flags & YV12_FLAG_HIGHBITDEPTH) { + if (input_shift) { + sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h, input_shift); + } else { + sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i], + b_strides[i], w, h); + } + } else { + sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + } + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} + +#endif // !CONFIG_HIGHBITDEPTH + +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr) { + static const double peak = 255.0; + const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; + const int heights[3] = { a->y_crop_height, a->uv_crop_height, + a->uv_crop_height }; + const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; + const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; + const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; + const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; + int i; + uint64_t total_sse = 0; + uint32_t total_samples = 0; + + for (i = 0; i < 3; ++i) { + const int w = widths[i]; + const int h = heights[i]; + const uint32_t samples = w * h; + const uint64_t sse = + get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + psnr->sse[1 + i] = sse; + psnr->samples[1 + i] = samples; + psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); + + total_sse += sse; + total_samples += samples; + } + + psnr->sse[0] = total_sse; + psnr->samples[0] = total_samples; + psnr->psnr[0] = + aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); +} diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h new file mode 100644 index 000000000..480140e6f --- /dev/null +++ b/third_party/aom/aom_dsp/psnr.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_PSNR_H_ +#define AOM_DSP_PSNR_H_ + +#include "aom_scale/yv12config.h" + +#define MAX_PSNR 100.0 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double psnr[4]; // total/y/u/v + uint64_t sse[4]; // total/y/u/v + uint32_t samples[4]; // total/y/u/v +} PSNR_STATS; + +/*!\brief Converts SSE to PSNR +* +* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). +* +* \param[in] samples Number of samples +* \param[in] peak Max sample value +* \param[in] sse Sum of squared errors +*/ +double aom_sse_to_psnr(double samples, double peak, double sse); +int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, int width, + int vstart, int height); +int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +#if CONFIG_HIGHBITDEPTH +int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int hstart, + int width, int vstart, int height); +int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b); +void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, + unsigned int bit_depth, unsigned int in_bit_depth); +#endif +void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, + PSNR_STATS *psnr); + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *phvs_y, + double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd); +#ifdef __cplusplus +} // extern "C" +#endif +#endif // AOM_DSP_PSNR_H_ diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c new file mode 100644 index 000000000..aeefd5908 --- /dev/null +++ b/third_party/aom/aom_dsp/psnrhvs.c @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + * + * This code was originally written by: Gregory Maxwell, at the Daala + * project. + */ + +#include +#include +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/psnr.h" +#include "aom_dsp/ssim.h" +#include "aom_ports/system_state.h" + +#if !defined(M_PI) +#define M_PI (3.141592653589793238462643) +#endif +#include + +static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} + +#if CONFIG_HIGHBITDEPTH +static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, + int xstride) { + int i, j; + (void)xstride; + aom_highbd_fdct8x8(x, y, ystride); + for (i = 0; i < 8; i++) + for (j = 0; j < 8; j++) + *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; +} +#endif + +/* Normalized inverse quantization matrix for 8x8 DCT at the point of + * transparency. This is not the JPEG based matrix from the paper, + this one gives a slightly higher MOS agreement.*/ +static const double csf_y[8][8] = { + { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334, + 0.678296995242, 0.466224900598, 0.3265091542 }, + { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, + 0.868920337363, 0.61280991668, 0.436405793551 }, + { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257, + 0.670882927016, 0.501731932449, 0.372504254596 }, + { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554, + 0.48309405692, 0.380429446972, 0.295774038565 }, + { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676, + 0.352889268808, 0.283006984131, 0.226951348204 }, + { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692, + 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 }, + { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972, + 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 }, + { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565, + 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 } +}; +static const double csf_cb420[8][8] = { + { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788, + 0.898018824055, 0.74725392039, 0.615105596242 }, + { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972, + 1.17428548929, 0.996404342439, 0.830890433625 }, + { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362, + 0.960060382087, 0.849823426169, 0.731221236837 }, + { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099, + 0.751437590932, 0.685398513368, 0.608694761374 }, + { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187, + 0.605503172737, 0.55002013668, 0.495804539034 }, + { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932, + 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 }, + { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368, + 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 }, + { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374, + 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 } +}; +static const double csf_cr420[8][8] = { + { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469, + 0.867069376285, 0.721500455585, 0.593906509971 }, + { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198, + 1.13381474809, 0.962064122248, 0.802254508198 }, + { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848, + 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 }, + { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195, + 0.725539939514, 0.661776842059, 0.587716619023 }, + { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286, + 0.584635025748, 0.531064164893, 0.478717061273 }, + { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514, + 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 }, + { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059, + 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 }, + { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023, + 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 } +}; + +static double convert_score_db(double _score, double _weight, int bit_depth) { + int16_t pix_max = 255; + assert(_score * _weight >= 0.0); + if (bit_depth == 10) + pix_max = 1023; + else if (bit_depth == 12) + pix_max = 4095; + + if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR; + return 10 * (log10(pix_max * pix_max) - log10(_weight * _score)); +} + +static double calc_psnrhvs(const unsigned char *src, int _systride, + const unsigned char *dst, int _dystride, double _par, + int _w, int _h, int _step, const double _csf[8][8], + uint32_t bit_depth, uint32_t _shift) { + double ret; + const uint8_t *_src8 = src; + const uint8_t *_dst8 = dst; + const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); + const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); + int16_t dct_s[8 * 8], dct_d[8 * 8]; + tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8]; + double mask[8][8]; + int pixels; + int x; + int y; + (void)_par; + ret = pixels = 0; + /*In the PSNR-HVS-M paper[1] the authors describe the construction of + their masking table as "we have used the quantization table for the + color component Y of JPEG [6] that has been also obtained on the + basis of CSF. Note that the values in quantization table JPEG have + been normalized and then squared." Their CSF matrix (from PSNR-HVS) + was also constructed from the JPEG matrices. I can not find any obvious + scheme of normalizing to produce their table, but if I multiply their + CSF by 0.38857 and square the result I get their masking table. + I have no idea where this constant comes from, but deviating from it + too greatly hurts MOS agreement. + + [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli, + Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking + of DCT basis functions", CD-ROM Proceedings of the Third + International Workshop on Video Processing and Quality Metrics for Consumer + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ + for (x = 0; x < 8; x++) + for (y = 0; y < 8; y++) + mask[x][y] = + (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); + for (y = 0; y < _h - 7; y += _step) { + for (x = 0; x < _w - 7; x += _step) { + int i; + int j; + double s_means[4]; + double d_means[4]; + double s_vars[4]; + double d_vars[4]; + double s_gmean = 0; + double d_gmean = 0; + double s_gvar = 0; + double d_gvar = 0; + double s_mask = 0; + double d_mask = 0; + for (i = 0; i < 4; i++) + s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + int sub = ((i & 12) >> 2) + ((j & 12) >> 1); + if (bit_depth == 8 && _shift == 0) { + dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; + dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; + } else if (bit_depth == 10 || bit_depth == 12) { + dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; + dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; + } + s_gmean += dct_s[i * 8 + j]; + d_gmean += dct_d[i * 8 + j]; + s_means[sub] += dct_s[i * 8 + j]; + d_means[sub] += dct_d[i * 8 + j]; + } + } + s_gmean /= 64.f; + d_gmean /= 64.f; + for (i = 0; i < 4; i++) s_means[i] /= 16.f; + for (i = 0; i < 4; i++) d_means[i] /= 16.f; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + int sub = ((i & 12) >> 2) + ((j & 12) >> 1); + s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean); + d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean); + s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) * + (dct_s[i * 8 + j] - s_means[sub]); + d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) * + (dct_d[i * 8 + j] - d_means[sub]); + } + } + s_gvar *= 1 / 63.f * 64; + d_gvar *= 1 / 63.f * 64; + for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16; + for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16; + if (s_gvar > 0) + s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar; + if (d_gvar > 0) + d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar; +#if CONFIG_HIGHBITDEPTH + if (bit_depth == 10 || bit_depth == 12) { + hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } +#endif + if (bit_depth == 8) { + od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j]; + for (i = 0; i < 8; i++) + for (j = (i == 0); j < 8; j++) + d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j]; + s_mask = sqrt(s_mask * s_gvar) / 32.f; + d_mask = sqrt(d_mask * d_gvar) / 32.f; + if (d_mask > s_mask) s_mask = d_mask; + for (i = 0; i < 8; i++) { + for (j = 0; j < 8; j++) { + double err; + err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j])); + if (i != 0 || j != 0) + err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j]; + ret += (err * _csf[i][j]) * (err * _csf[i][j]); + pixels++; + } + } + } + } + if (pixels <= 0) return 0; + ret /= pixels; + return ret; +} + +double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, + double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs, + uint32_t bd, uint32_t in_bd) { + double psnrhvs; + const double par = 1.0; + const int step = 7; + uint32_t bd_shift = 0; + aom_clear_system_state(); + + assert(bd == 8 || bd == 10 || bd == 12); + assert(bd >= in_bd); + + bd_shift = bd - in_bd; + + *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, par, src->y_crop_width, + src->y_crop_height, step, csf_y, bd, bd_shift); + *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, par, src->uv_crop_width, + src->uv_crop_height, step, csf_cb420, bd, bd_shift); + *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, par, src->uv_crop_width, + src->uv_crop_height, step, csf_cr420, bd, bd_shift); + psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); + return convert_score_db(psnrhvs, 1.0, in_bd); +} diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c new file mode 100644 index 000000000..0759c22e3 --- /dev/null +++ b/third_party/aom/aom_dsp/quantize.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/quantize.h" +#include "aom_mem/aom_mem.h" + +static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, +#if CONFIG_AOM_QM + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, +#endif + const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; +#if CONFIG_AOM_QM + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; +#else + const int coeff = coeff_ptr[rc]; +#endif // CONFIG_AOM_QM + +#if CONFIG_AOM_QM + if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && + coeff > (nzbins[rc != 0] << AOM_QM_BITS)) + non_zero_count--; +#else + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) non_zero_count--; +#endif // CONFIG_AOM_QM + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + +#if CONFIG_AOM_QM + const qm_val_t wt = qm_ptr[rc]; + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { +#else + if (abs_coeff >= zbins[rc != 0]) { +#endif // CONFIG_AOM_QM + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); +#if CONFIG_AOM_QM + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization +#else + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale)); // quantization +#endif // CONFIG_AOM_QM + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; +#if CONFIG_AOM_QM + const int dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); +#else + dqcoeff_ptr[rc] = + qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale); +#endif // CONFIG_AOM_QM + + if (tmp32) eob = i; + } + } + } + *eob_ptr = eob + 1; +} + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 0); +} + +void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 1); +} + +#if CONFIG_TX64X64 +void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan +#if CONFIG_AOM_QM + , + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr +#endif + ) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, +#if CONFIG_AOM_QM + qm_ptr, iqm_ptr, +#endif + 2); +} +#endif // CONFIG_TX64X64 + +#if CONFIG_AOM_QM +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp, eob = -1; + int32_t tmp32; + int dequant = + (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp, eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = + (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp, eob = -1; + int32_t tmp32; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = + (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; + if (tmp32) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + int eob = -1; + int dequant = + (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[0]; + const uint32_t abs_qcoeff = + (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + int eob = -1; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); + const uint32_t abs_qcoeff = + (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int n_coeffs = 1024; + int eob = -1; + int dequant; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); + const uint32_t abs_qcoeff = + (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS)); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int dequant; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; + + if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && + coeff > (nzbins[rc != 0] << AOM_QM_BITS)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; + if (abs_qcoeff) eob = i; + } + } + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + int dequant; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || + coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const qm_val_t wt = qm_ptr[rc]; + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_b_64x64_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), + ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[4096]; + int i, eob = -1; + int dequant; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr[rc]; + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || + coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const qm_val_t wt = qm_ptr[rc]; + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH + +#else // CONFIG_AOM_QM + +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + if (tmp) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int n_coeffs = 1024; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + if (tmp) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int n_coeffs = 4096; + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp, eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), + INT16_MIN, INT16_MAX); + tmp = (tmp * quant) >> 14; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4; + if (tmp) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr) { + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[0]; + const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, + uint16_t *eob_ptr) { + const int n_coeffs = 1024; + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); + const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, + uint16_t *eob_ptr) { + const int n_coeffs = 4096; + int eob = -1; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + const int coeff = coeff_ptr[0]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); + const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14); + qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4; + if (abs_qcoeff) eob = 0; + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 + +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; + } + } + } + *eob_ptr = eob + 1; +} + +void aom_highbd_quantize_b_32x32_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), + ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} + +#if CONFIG_TX64X64 +void aom_highbd_quantize_b_64x64_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), + ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[4096]; + int i, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AOM_QM diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h new file mode 100644 index 000000000..fe49b830f --- /dev/null +++ b/third_party/aom/aom_dsp/quantize.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_QUANTIZE_H_ +#define AOM_DSP_QUANTIZE_H_ + +#include "./aom_config.h" +#include "aom_dsp/aom_dsp_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_AOM_QM +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +#endif // CONFIG_TX64X64 +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +void aom_highbd_quantize_dc_32x32( + const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64( + const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr); +#endif // CONFIG_TX64X64 +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, + const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); +#endif // CONFIG_HIGHBITDEPTH + +#else // CONFIG_AOM_QM + +void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#if CONFIG_TX64X64 +void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#endif // CONFIG_TX64X64 +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr); +void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, + const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#if CONFIG_TX64X64 +void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, + const int16_t quant_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr); +#endif // CONFIG_TX64X64 +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AOM_QM + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_QUANTIZE_H_ diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c new file mode 100644 index 000000000..3e1070519 --- /dev/null +++ b/third_party/aom/aom_dsp/sad.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +/* Sum the difference between every corresponding element of the buffers. */ +static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define sadMxN(m, n) \ + unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } + +// depending on call sites, pass **ref_array to avoid & in subsequent call and +// de-dup with 4D below. +#define sadMxNxK(m, n, k) \ + void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ + } + +// This appears to be equivalent to the above when k == 4 and refs is const +#define sadMxNx4D(m, n) \ + void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ + } + +/* clang-format off */ +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +// 128x128 +sadMxN(128, 128) +sadMxNxK(128, 128, 3) +sadMxNxK(128, 128, 8) +sadMxNx4D(128, 128) + +// 128x64 +sadMxN(128, 64) +sadMxNx4D(128, 64) + +// 64x128 +sadMxN(64, 128) +sadMxNx4D(64, 128) +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION + +// 64x64 +sadMxN(64, 64) +sadMxNxK(64, 64, 3) +sadMxNxK(64, 64, 8) +sadMxNx4D(64, 64) + +// 64x32 +sadMxN(64, 32) +sadMxNx4D(64, 32) + +// 32x64 +sadMxN(32, 64) +sadMxNx4D(32, 64) + +// 32x32 +sadMxN(32, 32) +sadMxNxK(32, 32, 3) +sadMxNxK(32, 32, 8) +sadMxNx4D(32, 32) + +// 32x16 +sadMxN(32, 16) +sadMxNx4D(32, 16) + +// 16x32 +sadMxN(16, 32) +sadMxNx4D(16, 32) + +// 16x16 +sadMxN(16, 16) +sadMxNxK(16, 16, 3) +sadMxNxK(16, 16, 8) +sadMxNx4D(16, 16) + +// 16x8 +sadMxN(16, 8) +sadMxNxK(16, 8, 3) +sadMxNxK(16, 8, 8) +sadMxNx4D(16, 8) + +// 8x16 +sadMxN(8, 16) +sadMxNxK(8, 16, 3) +sadMxNxK(8, 16, 8) +sadMxNx4D(8, 16) + +// 8x8 +sadMxN(8, 8) +sadMxNxK(8, 8, 3) +sadMxNxK(8, 8, 8) +sadMxNx4D(8, 8) + +// 8x4 +sadMxN(8, 4) +sadMxNxK(8, 4, 8) +sadMxNx4D(8, 4) + +// 4x8 +sadMxN(4, 8) +sadMxNxK(4, 8, 8) +sadMxNx4D(4, 8) + +// 4x4 +sadMxN(4, 4) +sadMxNxK(4, 4, 3) +sadMxNxK(4, 4, 8) +sadMxNx4D(4, 4) +/* clang-format on */ + +#if CONFIG_HIGHBITDEPTH + static INLINE + unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, + const uint16_t *b, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + return sad; +} + +#define highbd_sadMxN(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_highbd_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint16_t comp_pred[m * n]; \ + aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + } + +#define highbd_sadMxNxK(m, n, k) \ + void aom_highbd_sad##m##x##n##x##k##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref_array, \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) { \ + sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ + &ref_array[i], ref_stride); \ + } \ + } + +#define highbd_sadMxNx4D(m, n) \ + void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ + ref_array[i], ref_stride); \ + } \ + } + +/* clang-format off */ +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +// 128x128 +highbd_sadMxN(128, 128) +highbd_sadMxNxK(128, 128, 3) +highbd_sadMxNxK(128, 128, 8) +highbd_sadMxNx4D(128, 128) + +// 128x64 +highbd_sadMxN(128, 64) +highbd_sadMxNx4D(128, 64) + +// 64x128 +highbd_sadMxN(64, 128) +highbd_sadMxNx4D(64, 128) +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION + +// 64x64 +highbd_sadMxN(64, 64) +highbd_sadMxNxK(64, 64, 3) +highbd_sadMxNxK(64, 64, 8) +highbd_sadMxNx4D(64, 64) + +// 64x32 +highbd_sadMxN(64, 32) +highbd_sadMxNx4D(64, 32) + +// 32x64 +highbd_sadMxN(32, 64) +highbd_sadMxNx4D(32, 64) + +// 32x32 +highbd_sadMxN(32, 32) +highbd_sadMxNxK(32, 32, 3) +highbd_sadMxNxK(32, 32, 8) +highbd_sadMxNx4D(32, 32) + +// 32x16 +highbd_sadMxN(32, 16) +highbd_sadMxNx4D(32, 16) + +// 16x32 +highbd_sadMxN(16, 32) +highbd_sadMxNx4D(16, 32) + +// 16x16 +highbd_sadMxN(16, 16) +highbd_sadMxNxK(16, 16, 3) +highbd_sadMxNxK(16, 16, 8) +highbd_sadMxNx4D(16, 16) + +// 16x8 +highbd_sadMxN(16, 8) +highbd_sadMxNxK(16, 8, 3) +highbd_sadMxNxK(16, 8, 8) +highbd_sadMxNx4D(16, 8) + +// 8x16 +highbd_sadMxN(8, 16) +highbd_sadMxNxK(8, 16, 3) +highbd_sadMxNxK(8, 16, 8) +highbd_sadMxNx4D(8, 16) + +// 8x8 +highbd_sadMxN(8, 8) +highbd_sadMxNxK(8, 8, 3) +highbd_sadMxNxK(8, 8, 8) +highbd_sadMxNx4D(8, 8) + +// 8x4 +highbd_sadMxN(8, 4) +highbd_sadMxNxK(8, 4, 8) +highbd_sadMxNx4D(8, 4) + +// 4x8 +highbd_sadMxN(4, 8) +highbd_sadMxNxK(4, 8, 8) +highbd_sadMxNx4D(4, 8) + +// 4x4 +highbd_sadMxN(4, 4) +highbd_sadMxNxK(4, 4, 3) +highbd_sadMxNxK(4, 4, 8) +highbd_sadMxNx4D(4, 4) +/* clang-format on */ +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_AV1 && CONFIG_EXT_INTER + static INLINE + unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const uint8_t *m, int m_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + m += m_stride; + } + sad = (sad + 31) >> 6; + + return sad; +} + +#define MASKSADMxN(m, n) \ + unsigned int aom_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \ + n); \ + } + +/* clang-format off */ +#if CONFIG_EXT_PARTITION +MASKSADMxN(128, 128) +MASKSADMxN(128, 64) +MASKSADMxN(64, 128) +#endif // CONFIG_EXT_PARTITION +MASKSADMxN(64, 64) +MASKSADMxN(64, 32) +MASKSADMxN(32, 64) +MASKSADMxN(32, 32) +MASKSADMxN(32, 16) +MASKSADMxN(16, 32) +MASKSADMxN(16, 16) +MASKSADMxN(16, 8) +MASKSADMxN(8, 16) +MASKSADMxN(8, 8) +MASKSADMxN(8, 4) +MASKSADMxN(4, 8) +MASKSADMxN(4, 4) +/* clang-format on */ + +#if CONFIG_HIGHBITDEPTH + static INLINE + unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + m += m_stride; + } + sad = (sad + 31) >> 6; + + return sad; +} + +#define HIGHBD_MASKSADMXN(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } + +#if CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN(128, 128) +HIGHBD_MASKSADMXN(128, 64) +HIGHBD_MASKSADMXN(64, 128) +#endif // CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN(64, 64) +HIGHBD_MASKSADMXN(64, 32) +HIGHBD_MASKSADMXN(32, 64) +HIGHBD_MASKSADMXN(32, 32) +HIGHBD_MASKSADMXN(32, 16) +HIGHBD_MASKSADMXN(16, 32) +HIGHBD_MASKSADMXN(16, 16) +HIGHBD_MASKSADMXN(16, 8) +HIGHBD_MASKSADMXN(8, 16) +HIGHBD_MASKSADMXN(8, 8) +HIGHBD_MASKSADMXN(8, 4) +HIGHBD_MASKSADMXN(4, 8) +HIGHBD_MASKSADMXN(4, 4) +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_EXT_INTER + +#if CONFIG_AV1 && CONFIG_MOTION_VAR +// pre: predictor being evaluated +// wsrc: target weighted prediction (has been *4096 to keep precision) +// mask: 2d weights (scaled by 4096) +static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define OBMCSADMxN(m, n) \ + unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +#if CONFIG_EXT_PARTITION +OBMCSADMxN(128, 128) +OBMCSADMxN(128, 64) +OBMCSADMxN(64, 128) +#endif // CONFIG_EXT_PARTITION +OBMCSADMxN(64, 64) +OBMCSADMxN(64, 32) +OBMCSADMxN(32, 64) +OBMCSADMxN(32, 32) +OBMCSADMxN(32, 16) +OBMCSADMxN(16, 32) +OBMCSADMxN(16, 16) +OBMCSADMxN(16, 8) +OBMCSADMxN(8, 16) +OBMCSADMxN(8, 8) +OBMCSADMxN(8, 4) +OBMCSADMxN(4, 8) +OBMCSADMxN(4, 4) +/* clang-format on */ + +#if CONFIG_HIGHBITDEPTH + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +#if CONFIG_EXT_PARTITION +HIGHBD_OBMCSADMXN(128, 128) +HIGHBD_OBMCSADMXN(128, 64) +HIGHBD_OBMCSADMXN(64, 128) +#endif // CONFIG_EXT_PARTITION +HIGHBD_OBMCSADMXN(64, 64) +HIGHBD_OBMCSADMXN(64, 32) +HIGHBD_OBMCSADMXN(32, 64) +HIGHBD_OBMCSADMXN(32, 32) +HIGHBD_OBMCSADMXN(32, 16) +HIGHBD_OBMCSADMXN(16, 32) +HIGHBD_OBMCSADMXN(16, 16) +HIGHBD_OBMCSADMXN(16, 8) +HIGHBD_OBMCSADMXN(8, 16) +HIGHBD_OBMCSADMXN(8, 8) +HIGHBD_OBMCSADMXN(8, 4) +HIGHBD_OBMCSADMXN(4, 8) +HIGHBD_OBMCSADMXN(4, 4) +/* clang-format on */ +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h new file mode 100644 index 000000000..8f6509383 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V128_INTRINSICS_H +#define _V128_INTRINSICS_H + +#include +#include +#include +#include "./v128_intrinsics_c.h" +#include "./v64_intrinsics.h" + +/* Fallback to plain, unoptimised C. */ + +typedef c_v128 v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); } +SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); } +SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); } +SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) { + return c_v128_from_64(hi, lo); +} +SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) { + return c_v128_from_v64(hi, lo); +} +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return c_v128_from_32(a, b, c, d); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { + return c_v128_load_unaligned(p); +} +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return c_v128_load_aligned(p); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { + c_v128_store_unaligned(p, a); +} +SIMD_INLINE void v128_store_aligned(void *p, v128 a) { + c_v128_store_aligned(p, a); +} + +SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { + return c_v128_align(a, b, c); +} + +SIMD_INLINE v128 v128_zero() { return c_v128_zero(); } +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); } +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); } +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); } + +typedef uint32_t sad128_internal; +SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); } +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + return c_v128_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { + return c_v128_sad_u8_sum(s); +} +typedef uint32_t ssd128_internal; +SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); } +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + return c_v128_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return c_v128_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + return c_v128_dotp_s16(a, b); +} +SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); } + +SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); } +SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); } +SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); } +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } +SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); } +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); } +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); } +SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); } +SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); } +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return c_v128_mullo_s16(a, b); +} +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return c_v128_mulhi_s16(a, b); +} +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return c_v128_mullo_s32(a, b); +} +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); } +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); } +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); } +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); } +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); } +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); } +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); } +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); } +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); } +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { + return c_v128_unziplo_8(a, b); +} +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return c_v128_unziphi_8(a, b); +} +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { + return c_v128_unziplo_16(a, b); +} +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return c_v128_unziphi_16(a, b); +} +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return c_v128_unziplo_32(a, b); +} +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return c_v128_unziphi_32(a, b); +} +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return c_v128_unpacklo_u8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return c_v128_unpackhi_u8_s16(a); +} +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); } +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return c_v128_unpacklo_s8_s16(a); +} +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return c_v128_unpackhi_s8_s16(a); +} +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return c_v128_pack_s32_s16(a, b); +} +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return c_v128_pack_s16_u8(a, b); +} +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return c_v128_pack_s16_s8(a, b); +} +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); } +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); } +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return c_v128_unpacklo_u16_s32(a); +} +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return c_v128_unpacklo_s16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return c_v128_unpackhi_u16_s32(a); +} +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return c_v128_unpackhi_s16_s32(a); +} +SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) { + return c_v128_shuffle_8(a, pattern); +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); } +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); } +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); } +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return c_v128_cmpgt_s16(a, b); +} +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return c_v128_cmplt_s16(a, b); +} +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return c_v128_shl_8(a, c); +} +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return c_v128_shr_u8(a, c); +} +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + return c_v128_shr_s8(a, c); +} +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return c_v128_shl_16(a, c); +} +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return c_v128_shr_u16(a, c); +} +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return c_v128_shr_s16(a, c); +} +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return c_v128_shl_32(a, c); +} +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return c_v128_shr_u32(a, c); +} +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return c_v128_shr_s32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return c_v128_shr_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return c_v128_shl_n_byte(a, n); +} +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) { + return c_v128_shl_n_8(a, n); +} +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { + return c_v128_shl_n_16(a, n); +} +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { + return c_v128_shl_n_32(a, n); +} +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) { + return c_v128_shr_n_u8(a, n); +} +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { + return c_v128_shr_n_u16(a, n); +} +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { + return c_v128_shr_n_u32(a, n); +} +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { + return c_v128_shr_n_s8(a, n); +} +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { + return c_v128_shr_n_s16(a, n); +} +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { + return c_v128_shr_n_s32(a, n); +} + +#endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h new file mode 100644 index 000000000..0377d4ce1 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h @@ -0,0 +1,671 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V128_INTRINSICS_H +#define _V128_INTRINSICS_H + +#include +#include "./v64_intrinsics_arm.h" + +typedef int64x2_t v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { + return v64_low_u32(vget_low_s64(a)); +} + +SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); } + +SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); } + +SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); } + +SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { + return vcombine_s64((uint64x1_t)b, (uint64x1_t)a); +} + +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b)); +} + +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p)); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { + return v128_load_aligned(p); +} + +SIMD_INLINE void v128_store_aligned(void *p, v128 r) { + vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 r) { + vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); +} + +SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { +// The following functions require an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + return c ? vreinterpretq_s64_s8( + vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c)) + : b; +#else + return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c), + v64_align(v128_high_v64(b), v128_low_v64(b), c)) + : v128_from_v64( + v64_align(v128_high_v64(a), v128_low_v64(a), c - 8), + v64_align(v128_low_v64(a), v128_high_v64(b), c - 8)); +#endif +} + +SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); } + +SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); } + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { + return vreinterpretq_s64_u8(vdupq_n_u8(x)); +} + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { + return vreinterpretq_s64_u16(vdupq_n_u16(x)); +} + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { + return vreinterpretq_s64_u32(vdupq_n_u32(x)); +} + +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + + v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); +} + +SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { + uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); + return vget_lane_s32( + vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +} + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); +} + +typedef struct { sad64_internal hi, lo; } sad128_internal; + +SIMD_INLINE sad128_internal v128_sad_u8_init() { + sad128_internal s; + s.hi = s.lo = vdupq_n_u16(0); + return s; +} + +/* Implementation dependent return value. Result must be finalised with + v128_sad_u8_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + sad128_internal r; + r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { + return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo)); +} + +typedef struct { ssd64_internal hi, lo; } ssd128_internal; + +SIMD_INLINE ssd128_internal v128_ssd_u8_init() { + ssd128_internal s; + s.hi = s.lo = (ssd64_internal)(uint64_t)0; + return s; +} + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). */ +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + ssd128_internal r; + r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); + r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); + return r; +} + +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo)); +} + +SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); } + +SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); } + +SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); } + +SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); } + +SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); +} + +SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_abs_s16(v128 x) { + return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); +} + +SIMD_INLINE v128 v128_abs_s8(v128 x) { + return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x))); +} + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { + return vreinterpretq_s64_s32( + vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return vreinterpretq_s64_s16( + vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); +} + +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), + v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); +} + +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { + return vreinterpretq_s64_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); +} + +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { + return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), + v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); +} + +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { + return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), + v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); +} + +SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + +SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { + return vreinterpretq_s64_s16( + vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +} + +SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { + uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +} + +SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[0]); +} + +SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { + int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); + return vreinterpretq_s64_s16(r.val[1]); +} + +SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { + uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[0]); +} + +SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { + int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); + return vreinterpretq_s64_s32(r.val[1]); +} + +SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { + uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)); + return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1])); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return v128_from_v64(vget_low_u64((uint64x2_t)a), + vget_low_u64((uint64x2_t)b)); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return v128_from_v64(vget_high_u64((uint64x2_t)a), + vget_high_u64((uint64x2_t)b)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[0]); +} + +SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { + uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); + return vreinterpretq_s64_u8(r.val[1]); +} + +SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + return vreinterpretq_s64_u16(r.val[0]); +} + +SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); + return vreinterpretq_s64_u16(r.val[1]); +} + +SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[0]); +} + +SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { + uint32x4x2_t r = + vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); + return vreinterpretq_s64_u32(r.val[1]); +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))), + vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))), + vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b)))); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a))); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a))); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return vreinterpretq_s64_u32( + vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return vreinterpretq_s64_s32( + vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a)))); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { + return v128_from_64( + (uint64_t)vreinterpret_s64_u8( + vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)), + vget_high_u8(vreinterpretq_u8_s64(x)) } }, + vreinterpret_u8_s64(vget_high_s64(pattern)))), + (uint64_t)vreinterpret_s64_u8( + vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)), + vget_high_u8(vreinterpretq_u8_s64(x)) } }, + vreinterpret_u8_s64(vget_low_s64(pattern))))); +} + +SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); +} + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8( + vreinterpretq_u8_s64(a), vdupq_n_s8(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8( + vreinterpretq_u8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8( + vreinterpretq_s8_s64(a), vdupq_n_s8(-c))); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c))); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return (c > 15) ? v128_zero() + : vreinterpretq_s64_u16( + vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return (c > 15) ? v128_ones() + : vreinterpretq_s64_s16( + vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c))); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c))); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return (c > 31) ? v128_zero() + : vreinterpretq_s64_u32( + vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c))); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return (c > 31) ? v128_ones() + : vreinterpretq_s64_s32( + vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c))); +} + +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + return n < 8 + ? v128_from_64( + (uint64_t)vorr_u64( + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + n * 8), + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + (8 - n) * 8)), + (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), + n * 8)) + : (n == 8 ? v128_from_64( + (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0) + : v128_from_64((uint64_t)vshl_n_u64( + vreinterpret_u64_s64(vget_low_s64(a)), + (n - 8) * 8), + 0)); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + return n < 8 + ? v128_from_64( + vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8), + vorr_u64( + vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8), + vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + (8 - n) * 8))) + : (n == 8 + ? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a))) + : v128_from_64( + 0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + (n - 8) * 8))); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)); +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c)); +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { + return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)); +} + +#else + +SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n), + v64_shr_n_byte(v128_low_v64(a), 8 - n)), + v64_shl_n_byte(v128_low_v64(a), n)); + else + return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero()); +} + +SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { + if (n < 8) + return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n), + v64_or(v64_shr_n_byte(v128_low_v64(a), n), + v64_shl_n_byte(v128_high_v64(a), 8 - n))); + else + return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8)); +} + +SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { + return v128_shl_8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { + return v128_shr_u8(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { + return v128_shr_s8(a, c); +} + +SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { + return v128_shl_16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { + return v128_shr_u16(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { + return v128_shr_s16(a, c); +} + +SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { + return v128_shl_32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { + return v128_shr_u32(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { + return v128_shr_s32(a, c); +} + +#endif + +#endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h new file mode 100644 index 000000000..32e7c32de --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V128_INTRINSICS_C_H +#define _V128_INTRINSICS_C_H + +#include +#include +#include "./v64_intrinsics_c.h" +#include "./aom_config.h" + +typedef union { + uint8_t u8[16]; + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + int8_t s8[16]; + int16_t s16[8]; + int32_t s32[4]; + int64_t s64[2]; + c_v64 v64[2]; +} c_v128; + +SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; } + +SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; } + +SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; } + +SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) { + c_v128 t; + t.u64[1] = hi; + t.u64[0] = lo; + return t; +} + +SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) { + c_v128 t; + t.v64[1] = hi; + t.v64[0] = lo; + return t; +} + +SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c, + uint32_t d) { + c_v128 t; + t.u32[3] = a; + t.u32[2] = b; + t.u32[1] = c; + t.u32[0] = d; + return t; +} + +SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) { + c_v128 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 16; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 15) { + fprintf(stderr, "Error: unaligned v128 load at %p\n", p); + abort(); + } + return c_v128_load_unaligned(p); +} + +SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 16; c++) pp[c] = q[c]; +} + +SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) { + if (SIMD_CHECK && (uintptr_t)p & 15) { + fprintf(stderr, "Error: unaligned v128 store at %p\n", p); + abort(); + } + c_v128_store_unaligned(p, a); +} + +SIMD_INLINE c_v128 c_v128_zero() { + c_v128 t; + t.u64[1] = t.u64[0] = 0; + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_8(x); + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_16(x); + return t; +} + +SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) { + c_v128 t; + t.v64[1] = t.v64[0] = c_v64_dup_32(x); + return t; +} + +SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { + return c_v64_dotp_s16(a.v64[1], b.v64[1]) + + c_v64_dotp_s16(a.v64[0], b.v64[0]); +} + +SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { + return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); +} + +typedef uint32_t c_sad128_internal; + +SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v128_sad_u8_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) + s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + return s; +} + +SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; } + +typedef uint32_t c_ssd128_internal; + +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_u8_sum(). */ +SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a, + c_v128 b) { + int c; + for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; } + +SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]), + c_v64_or(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]), + c_v64_xor(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]), + c_v64_and(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]), + c_v64_andn(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]), + c_v64_add_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]), + c_v64_add_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), + c_v64_sadd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]), + c_v64_add_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) { + c_v128 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + return t; +} + +SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]), + c_v64_sub_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]), + c_v64_ssub_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]), + c_v64_ssub_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]), + c_v64_sub_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]), + c_v64_ssub_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]), + c_v64_ssub_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]), + c_v64_sub_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) { + return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) { + c_v64 lo_bits = c_v64_mullo_s16(a, b); + c_v64 hi_bits = c_v64_mulhi_s16(a, b); + return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits), + c_v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]), + c_v64_mullo_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]), + c_v64_mulhi_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]), + c_v64_mullo_s32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]), + c_v64_madd_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]), + c_v64_madd_us8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]), + c_v64_avg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]), + c_v64_rdavg_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), + c_v64_avg_u16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]), + c_v64_min_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]), + c_v64_max_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]), + c_v64_min_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), + c_v64_max_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]), + c_v64_min_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]), + c_v64_max_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), + c_v64_ziplo_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]), + c_v64_ziplo_8(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]), + c_v64_ziplo_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]), + c_v64_ziplo_16(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]), + c_v64_ziplo_32(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]), + c_v64_ziplo_32(a.v64[1], b.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[0], b.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) { + return c_v128_from_v64(a.v64[1], b.v64[1]); +} + +SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b)); +} + +SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) { + return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b)); +} + +SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u8[15] = b.u8[15]; + t.u8[14] = b.u8[13]; + t.u8[13] = b.u8[11]; + t.u8[12] = b.u8[9]; + t.u8[11] = b.u8[7]; + t.u8[10] = b.u8[5]; + t.u8[9] = b.u8[3]; + t.u8[8] = b.u8[1]; + t.u8[7] = a.u8[15]; + t.u8[6] = a.u8[13]; + t.u8[5] = a.u8[11]; + t.u8[4] = a.u8[9]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[15] = a.u8[14]; + t.u8[14] = a.u8[12]; + t.u8[13] = a.u8[10]; + t.u8[12] = a.u8[8]; + t.u8[11] = a.u8[6]; + t.u8[10] = a.u8[4]; + t.u8[9] = a.u8[2]; + t.u8[8] = a.u8[0]; + t.u8[7] = b.u8[14]; + t.u8[6] = b.u8[12]; + t.u8[5] = b.u8[10]; + t.u8[4] = b.u8[8]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1) + : _c_v128_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0) + : _c_v128_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u16[7] = b.u16[7]; + t.u16[6] = b.u16[5]; + t.u16[5] = b.u16[3]; + t.u16[4] = b.u16[1]; + t.u16[3] = a.u16[7]; + t.u16[2] = a.u16[5]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[7] = a.u16[6]; + t.u16[6] = a.u16[4]; + t.u16[5] = a.u16[2]; + t.u16[4] = a.u16[0]; + t.u16[3] = b.u16[6]; + t.u16[2] = b.u16[4]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1) + : _c_v128_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0) + : _c_v128_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) { + c_v128 t; + if (mode) { + t.u32[3] = b.u32[3]; + t.u32[2] = b.u32[1]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[3] = a.u32[2]; + t.u32[2] = a.u32[0]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1) + : _c_v128_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) { + return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0) + : _c_v128_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]), + c_v64_unpacklo_u8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]), + c_v64_unpacklo_u8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]), + c_v64_unpacklo_s8_s16(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]), + c_v64_unpacklo_s8_s16(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]), + c_v64_pack_s32_s16(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), + c_v64_pack_s16_u8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]), + c_v64_pack_s16_s8(b.v64[1], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]), + c_v64_unpacklo_u16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]), + c_v64_unpacklo_s16_s32(a.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]), + c_v64_unpacklo_u16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { + return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]), + c_v64_unpacklo_s16_s32(a.v64[1])); +} + +SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { + c_v128 t; + int c; + for (c = 0; c < 16; c++) { + if (pattern.u8[c] & ~15) { + fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c], + c); + abort(); + } + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) + : pattern.u8[c] & 15]; + } + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]), + c_v64_cmpgt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]), + c_v64_cmplt_s8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]), + c_v64_cmpeq_8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]), + c_v64_cmpgt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]), + c_v64_cmplt_s16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]), + c_v64_cmpeq_16(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) { + if (n < 8) + return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), + c_v64_shr_n_byte(a.v64[0], 8 - n)), + c_v64_shl_n_byte(a.v64[0], n)); + else + return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); +} + +SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) { + if (n < 8) + return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), + c_v64_or(c_v64_shr_n_byte(a.v64[0], n), + c_v64_shl_n_byte(a.v64[1], 8 - n))); + else + return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); +} + +SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) { + if (SIMD_CHECK && c > 15) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c)) + : b; +} + +SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), + c_v64_shr_u16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), + c_v64_shr_s16(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), + c_v64_shr_u32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, unsigned int c) { + return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), + c_v64_shr_s32(a.v64[0], c)); +} + +SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, unsigned int n) { + return c_v128_shl_8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, unsigned int n) { + return c_v128_shl_16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, unsigned int n) { + return c_v128_shl_32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, unsigned int n) { + return c_v128_shr_u8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, unsigned int n) { + return c_v128_shr_u16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, unsigned int n) { + return c_v128_shr_u32(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, unsigned int n) { + return c_v128_shr_s8(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, unsigned int n) { + return c_v128_shr_s16(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, unsigned int n) { + return c_v128_shr_s32(a, n); +} + +#endif /* _V128_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h new file mode 100644 index 000000000..cca1788d5 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V128_INTRINSICS_H +#define _V128_INTRINSICS_H + +#include "./v64_intrinsics_x86.h" + +typedef __m128i v128; + +SIMD_INLINE uint32_t v128_low_u32(v128 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE v64 v128_low_v64(v128 a) { + return _mm_unpacklo_epi64(a, v64_zero()); +} + +SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); } + +SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { + return v128_from_v64(v64_from_64(a), v64_from_64(b)); +} + +SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { + return _mm_set_epi32(a, b, c, d); +} + +SIMD_INLINE v128 v128_load_aligned(const void *p) { + return _mm_load_si128((__m128i *)p); +} + +SIMD_INLINE v128 v128_load_unaligned(const void *p) { +#if defined(__SSSE3__) + return (__m128i)_mm_lddqu_si128((__m128i *)p); +#else + return _mm_loadu_si128((__m128i *)p); +#endif +} + +SIMD_INLINE void v128_store_aligned(void *p, v128 a) { + _mm_store_si128((__m128i *)p, a); +} + +SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { + _mm_storeu_si128((__m128i *)p, a); +} + +// The following function requires an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) +#if defined(__SSSE3__) +SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { + return c ? _mm_alignr_epi8(a, b, c) : b; +} +#else +#define v128_align(a, b, c) \ + ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#else +#if defined(__SSSE3__) +#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b)) +#else +#define v128_align(a, b, c) \ + ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) +#endif +#endif + +SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); } + +SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v128 v128_padd_s16(v128 a) { + return _mm_madd_epi16(a, _mm_set1_epi16(1)); +} + +SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v128 v128_abs_s16(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v128 v128_abs_s8(v128 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { + return _mm_unpacklo_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { + return _mm_unpackhi_epi8(b, a); +} + +SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { + return _mm_unpacklo_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { + return _mm_unpackhi_epi16(b, a); +} + +SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { + return _mm_unpacklo_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { + return _mm_unpackhi_epi32(b, a); +} + +SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { + return _mm_unpacklo_epi64(b, a); +} + +SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { + return _mm_unpackhi_epi64(b, a); +} + +SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) { + return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8)); +} + +SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) { + return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)); +} + +SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) { +#if defined(__SSSE3__) +#ifdef __x86_64__ + v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL); +#else + v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100); +#endif + return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order), + _mm_shuffle_epi8(a, order)); +#else + return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1))); +} + +SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) { + return _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0))); +} + +SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { + return _mm_unpackhi_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { + return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8); +} + +SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { + return _mm_packs_epi32(b, a); +} + +SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { + return _mm_packus_epi16(b, a); +} + +SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { + return _mm_packs_epi16(b, a); +} + +SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { + return _mm_unpackhi_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { + return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16); +} + +SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v128 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; counter < 16; counter++) { + selected[counter] = input[index[counter] & 15]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { + v128 r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r), + _mm_cvtepi32_epi64(_mm_srli_si128(r, 8))); + return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(r) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); +#endif +} + +SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { + v128 t = _mm_sad_epu8(a, _mm_setzero_si128()); + return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t)); +} + +typedef v128 sad128_internal; + +SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + v128_sad_sum(). + The result for more than 32 v128_sad_u8() calls is undefined. */ +SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { + return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); +} + +typedef v128 ssd128_internal; + +SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_sum(). */ +SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { + v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_unpacklo_epi8(b, _mm_setzero_si128())); + v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), + _mm_unpackhi_epi8(b, _mm_setzero_si128())); + v128 rl = _mm_madd_epi16(l, l); + v128 rh = _mm_madd_epi16(h, h); + v128 c = _mm_cvtsi32_si128(32); + rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8)); + rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4)); + rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8)); + rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4)); + return _mm_add_epi64( + s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c)); +} + +SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { + return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); +} + +SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { + v64 lo_bits = v64_mullo_s16(a, b); + v64 hi_bits = v64_mulhi_s16(a, b); + return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits), + v64_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { + return _mm_mullo_epi16(a, b); +} + +SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { + return _mm_mulhi_epi16(a, b); +} + +SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8), + _mm_shuffle_epi32( + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8)); +#endif +} + +SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + return _mm_packs_epi32( + _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)), + _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8))); +#endif +} + +SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); +} + +SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v128 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) { + return _mm_cmpgt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { + return _mm_cmplt_epi16(a, b); +} + +SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8(0xff >> c), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x), + _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c) +#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c) +#define v128_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v128_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) +#define v128_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \ + _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8)) +#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) + +#endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h new file mode 100644 index 000000000..1896374ee --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_H +#define _V256_INTRINSICS_H + +#include +#include +#include +#include "./v256_intrinsics_c.h" +#include "./v128_intrinsics.h" +#include "./v64_intrinsics.h" + +/* Fallback to plain, unoptimised C. */ + +typedef c_v256 v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); } +SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); } +SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); } +SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); } +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + return c_v256_from_v128(hi, lo); +} +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return c_v256_from_64(a, b, c, d); +} +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return c_v256_from_v64(a, b, c, d); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return c_v256_load_unaligned(p); +} +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return c_v256_load_aligned(p); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + c_v256_store_unaligned(p, a); +} +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + c_v256_store_aligned(p, a); +} + +SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) { + return c_v256_align(a, b, c); +} + +SIMD_INLINE v256 v256_zero() { return c_v256_zero(); } +SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); } +SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); } +SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); } + +typedef uint32_t sad256_internal; +SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); } +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + return c_v256_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + return c_v256_sad_u8_sum(s); +} +typedef uint32_t ssd256_internal; +SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); } +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + return c_v256_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + return c_v256_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return c_v256_dotp_s16(a, b); +} +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); } +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); } +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } +SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); } +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); } +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); } +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); } +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); } +SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); } +SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); } + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); } +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return c_v256_mullo_s16(a, b); +} +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return c_v256_mulhi_s16(a, b); +} +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return c_v256_mullo_s32(a, b); +} +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); } +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); } +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); } +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); } +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); } +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); } +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); } +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return c_v256_ziplo_128(a, b); +} +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return c_v256_ziphi_128(a, b); +} +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); } +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); } +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); } +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return c_v256_unziplo_8(a, b); +} +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return c_v256_unziphi_8(a, b); +} +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return c_v256_unziplo_16(a, b); +} +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return c_v256_unziphi_16(a, b); +} +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return c_v256_unziplo_32(a, b); +} +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return c_v256_unziphi_32(a, b); +} +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return c_v256_unpacklo_u8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return c_v256_unpackhi_u8_s16(a); +} +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); } +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return c_v256_unpacklo_s8_s16(a); +} +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return c_v256_unpackhi_s8_s16(a); +} +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return c_v256_pack_s32_s16(a, b); +} +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return c_v256_pack_s16_u8(a, b); +} +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return c_v256_pack_s16_s8(a, b); +} +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return c_v256_unpack_u16_s32(a); +} +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return c_v256_unpack_s16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return c_v256_unpacklo_u16_s32(a); +} +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return c_v256_unpacklo_s16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return c_v256_unpackhi_u16_s32(a); +} +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return c_v256_unpackhi_s16_s32(a); +} +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + return c_v256_shuffle_8(a, pattern); +} +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return c_v256_pshuffle_8(a, pattern); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); } +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); } +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); } +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return c_v256_cmpgt_s16(a, b); +} +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return c_v256_cmplt_s16(a, b); +} +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return c_v256_shl_8(a, c); +} +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return c_v256_shr_u8(a, c); +} +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + return c_v256_shr_s8(a, c); +} +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return c_v256_shl_16(a, c); +} +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return c_v256_shr_u16(a, c); +} +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return c_v256_shr_s16(a, c); +} +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return c_v256_shl_32(a, c); +} +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return c_v256_shr_u32(a, c); +} +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return c_v256_shr_s32(a, c); +} + +SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { + return c_v256_shr_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) { + return c_v256_shl_n_byte(a, n); +} +SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) { + return c_v256_shl_n_8(a, n); +} +SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { + return c_v256_shl_n_16(a, n); +} +SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { + return c_v256_shl_n_32(a, n); +} +SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { + return c_v256_shr_n_u8(a, n); +} +SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { + return c_v256_shr_n_u16(a, n); +} +SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { + return c_v256_shr_n_u32(a, n); +} +SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) { + return c_v256_shr_n_s8(a, n); +} +SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { + return c_v256_shr_n_s16(a, n); +} +SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) { + return c_v256_shr_n_s32(a, n); +} + +#endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h new file mode 100644 index 000000000..ba4ed719d --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_H +#define _V256_INTRINSICS_H + +#include "./v256_intrinsics_v128.h" + +#endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h new file mode 100644 index 000000000..f96ca7fa6 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_C_H +#define _V256_INTRINSICS_C_H + +#include +#include +#include "./v128_intrinsics_c.h" +#include "./aom_config.h" + +typedef union { + uint8_t u8[32]; + uint16_t u16[16]; + uint32_t u32[8]; + uint64_t u64[4]; + int8_t s8[32]; + int16_t s16[16]; + int32_t s32[8]; + int64_t s64[4]; + c_v64 v64[4]; + c_v128 v128[2]; +} c_v256; + +SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; } + +SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; } + +SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; } + +SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; } + +SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) { + c_v256 t; + t.v128[1] = hi; + t.v128[0] = lo; + return t; +} + +SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c, + uint64_t d) { + c_v256 t; + t.u64[3] = a; + t.u64[2] = b; + t.u64[1] = c; + t.u64[0] = d; + return t; +} + +SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) { + c_v256 t; + t.u64[3] = a.u64; + t.u64[2] = b.u64; + t.u64[1] = c.u64; + t.u64[0] = d.u64; + return t; +} + +SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) { + c_v256 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 32; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 31) { + fprintf(stderr, "Error: unaligned v256 load at %p\n", p); + abort(); + } + return c_v256_load_unaligned(p); +} + +SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 32; c++) pp[c] = q[c]; +} + +SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) { + if (SIMD_CHECK && (uintptr_t)p & 31) { + fprintf(stderr, "Error: unaligned v256 store at %p\n", p); + abort(); + } + c_v256_store_unaligned(p, a); +} + +SIMD_INLINE c_v256 c_v256_zero() { + c_v256 t; + t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x); + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x); + return t; +} + +SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) { + c_v256 t; + t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x); + return t; +} + +SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) { + return c_v128_dotp_s16(a.v128[1], b.v128[1]) + + c_v128_dotp_s16(a.v128[0], b.v128[0]); +} + +SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) { + return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]); +} + +typedef uint32_t c_sad256_internal; + +SIMD_INLINE c_sad128_internal c_v256_sad_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad128_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) + s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; } + +typedef uint32_t c_ssd256_internal; + +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a, + c_v256 b) { + int c; + for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; } + +SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]), + c_v128_or(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]), + c_v128_xor(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]), + c_v128_and(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]), + c_v128_andn(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]), + c_v128_add_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]), + c_v128_add_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]), + c_v128_sadd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]), + c_v128_add_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { + c_v256 t; + t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; + t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3]; + t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5]; + t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7]; + t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9]; + t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11]; + t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13]; + t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15]; + return t; +} + +SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]), + c_v128_sub_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]), + c_v128_ssub_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]), + c_v128_ssub_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]), + c_v128_sub_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]), + c_v128_ssub_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]), + c_v128_ssub_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]), + c_v128_sub_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) { + return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) { + c_v128 lo_bits = c_v128_mullo_s16(a, b); + c_v128 hi_bits = c_v128_mulhi_s16(a, b); + return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits), + c_v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]), + c_v128_mullo_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]), + c_v128_mulhi_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]), + c_v128_mullo_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]), + c_v128_madd_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]), + c_v128_madd_us8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]), + c_v128_avg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]), + c_v128_rdavg_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), + c_v128_avg_u16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]), + c_v128_min_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]), + c_v128_max_u8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]), + c_v128_min_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), + c_v128_max_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]), + c_v128_min_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]), + c_v128_max_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), + c_v128_ziplo_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]), + c_v128_ziplo_8(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]), + c_v128_ziplo_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]), + c_v128_ziplo_16(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]), + c_v128_ziplo_32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]), + c_v128_ziplo_32(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]), + c_v128_ziplo_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]), + c_v128_ziplo_64(a.v128[1], b.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) { + return c_v256_from_v128(a.v128[0], b.v128[0]); +} + +SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) { + return c_v256_from_v128(a.v128[1], b.v128[1]); +} + +SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b)); +} + +SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) { + return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b)); +} + +SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 16; i++) { + t.u8[i] = a.u8[i * 2 + 1]; + t.u8[i + 16] = b.u8[i * 2 + 1]; + } + } else { + for (i = 0; i < 16; i++) { + t.u8[i] = b.u8[i * 2]; + t.u8[i + 16] = a.u8[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1) + : _c_v256_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0) + : _c_v256_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) { + c_v256 t; + int i; + if (mode) { + for (i = 0; i < 8; i++) { + t.u16[i] = a.u16[i * 2 + 1]; + t.u16[i + 8] = b.u16[i * 2 + 1]; + } + } else { + for (i = 0; i < 8; i++) { + t.u16[i] = b.u16[i * 2]; + t.u16[i + 8] = a.u16[i * 2]; + } + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1) + : _c_v256_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0) + : _c_v256_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u32[7] = b.u32[7]; + t.u32[6] = b.u32[5]; + t.u32[5] = b.u32[3]; + t.u32[4] = b.u32[1]; + t.u32[3] = a.u32[7]; + t.u32[2] = a.u32[5]; + t.u32[1] = a.u32[3]; + t.u32[0] = a.u32[1]; + } else { + t.u32[7] = a.u32[6]; + t.u32[6] = a.u32[4]; + t.u32[5] = a.u32[2]; + t.u32[4] = a.u32[0]; + t.u32[3] = b.u32[6]; + t.u32[2] = b.u32[4]; + t.u32[1] = b.u32[2]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1) + : _c_v256_unzip_32(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0) + : _c_v256_unzip_32(b, a, 1); +} + +SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]), + c_v128_unpacklo_u8_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]), + c_v128_unpacklo_u8_s16(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]), + c_v128_unpacklo_s8_s16(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]), + c_v128_unpacklo_s8_s16(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]), + c_v128_pack_s32_s16(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]), + c_v128_pack_s16_u8(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]), + c_v128_pack_s16_s8(b.v128[1], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a), + c_v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a), + c_v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]), + c_v128_unpacklo_u16_s32(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]), + c_v128_unpacklo_s16_s32(a.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]), + c_v128_unpacklo_u16_s32(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) { + return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]), + c_v128_unpacklo_s16_s32(a.v128[1])); +} + +SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) { + c_v256 t; + int c; + for (c = 0; c < 32; c++) { + if (pattern.u8[c] & ~31) { + fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c], + c); + abort(); + } + t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) + : pattern.u8[c] & 31]; + } + return t; +} + +// Pairwise / dual-lane shuffle: shuffle two 128 bit lates. +SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) { + return c_v256_from_v128( + c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)), + c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern))); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]), + c_v128_cmpgt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]), + c_v128_cmplt_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]), + c_v128_cmpeq_8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]), + c_v128_cmpgt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]), + c_v128_cmplt_s16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]), + c_v128_cmpeq_16(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { + if (n < 16) + return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), + c_v128_shr_n_byte(a.v128[0], 16 - n)), + c_v128_shl_n_byte(a.v128[0], n)); + else if (n > 16) + return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16), + c_v128_zero()); + else + return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero()); +} + +SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) { + if (n < 16) + return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n), + c_v128_or(c_v128_shr_n_byte(a.v128[0], n), + c_v128_shl_n_byte(a.v128[1], 16 - n))); + else if (n > 16) + return c_v256_from_v128(c_v128_zero(), + c_v128_shr_n_byte(a.v128[1], n - 16)); + else + return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a)); +} + +SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) { + if (SIMD_CHECK && c > 31) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c)) + : b; +} + +SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_8(a.v128[1], c), + c_v128_shl_8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c), + c_v128_shr_u8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c), + c_v128_shr_s8(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_16(a.v128[1], c), + c_v128_shl_16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c), + c_v128_shr_u16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c), + c_v128_shr_s16(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shl_32(a.v128[1], c), + c_v128_shl_32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c), + c_v128_shr_u32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) { + return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c), + c_v128_shr_s32(a.v128[0], c)); +} + +SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) { + return c_v256_shl_8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) { + return c_v256_shl_16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) { + return c_v256_shl_32(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) { + return c_v256_shr_u8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) { + return c_v256_shr_u16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) { + return c_v256_shr_u32(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) { + return c_v256_shr_s8(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) { + return c_v256_shr_s16(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) { + return c_v256_shr_s32(a, n); +} + +#endif /* _V256_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h new file mode 100644 index 000000000..a4b334ea6 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_V128_H +#define _V256_INTRINSICS_V128_H + +#if HAVE_NEON +#include "./v128_intrinsics_arm.h" +#elif HAVE_SSE2 +#include "./v128_intrinsics_x86.h" +#else +#include "./v128_intrinsics.h" +#endif + +typedef struct { v128 lo, hi; } v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); } + +SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); } + +SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; } + +SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; } + +SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { + v256 t; + t.hi = hi; + t.lo = lo; + return t; +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16), + v128_load_unaligned(p)); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return v256_from_v128(v128_load_aligned((uint8_t *)p + 16), + v128_load_aligned(p)); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + v128_store_unaligned(p, a.lo); + v128_store_unaligned((uint8_t *)p + 16, a.hi); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + v128_store_aligned(p, a.lo); + v128_store_aligned((uint8_t *)p + 16, a.hi); +} + +SIMD_INLINE v256 v256_zero() { + return v256_from_v128(v128_zero(), v128_zero()); +} + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { + v128 t = v128_dup_8(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { + v128 t = v128_dup_16(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { + v128 t = v128_dup_32(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo); +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo); +} + +typedef struct { + sad128_internal hi; + sad128_internal lo; +} sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init() { + sad256_internal t; + t.hi = v128_sad_u8_init(); + t.lo = v128_sad_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u8_sum(). + The result for more than 16 v256_sad_u8() calls is undefined. */ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + sad256_internal t; + t.hi = v128_sad_u8(s.hi, a.hi, b.hi); + t.lo = v128_sad_u8(s.lo, a.lo, b.lo); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo); +} + +typedef struct { + ssd128_internal hi; + ssd128_internal lo; +} ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init() { + ssd256_internal t; + t.hi = v128_ssd_u8_init(); + t.lo = v128_ssd_u8_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_u8_sum(). */ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + ssd256_internal t; + t.hi = v128_ssd_u8(s.hi, a.hi, b.hi); + t.lo = v128_ssd_u8(s.lo, a.lo, b.lo); + return t; +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { + return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { + return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { + return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { + return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { + return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { + return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { + return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo)); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { + return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { + return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { + return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_abs_s16(v256 a) { + return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo)); +} + +SIMD_INLINE v256 v256_abs_s8(v256 a) { + return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo)); +} + +SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { + return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { + return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { + return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { + return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { + return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { + return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { + return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { + return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi)); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return v256_from_v128(a.lo, b.lo); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return v256_from_v128(a.hi, b.hi); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_16(a.hi, a.lo), + v128_unziplo_16(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_16(a.hi, a.lo), + v128_unziphi_16(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_32(a.hi, a.lo), + v128_unziplo_32(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_32(a.hi, a.lo), + v128_unziphi_32(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo)); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi)); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.lo), v128_unpacklo_s8_s16(a.lo)); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a.hi), v128_unpacklo_s8_s16(a.hi)); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo), + v128_pack_s32_s16(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo), + v128_pack_s16_u8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo), + v128_pack_s16_s8(b.hi, b.lo)); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.lo), + v128_unpacklo_u16_s32(a.lo)); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.lo), + v128_unpacklo_s16_s32(a.lo)); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a.hi), + v128_unpacklo_u16_s32(a.hi)); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a.hi), + v128_unpacklo_s16_s32(a.hi)); +} + +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + v128 c16 = v128_dup_8(16); + v128 maskhi = v128_cmplt_s8(pattern.hi, c16); + v128 masklo = v128_cmplt_s8(pattern.lo, c16); + return v256_from_v128( + v128_or( + v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi), + v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)), + v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo), + v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)), + masklo))); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return v256_from_v128( + v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), + v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); +} + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo)); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n), \ + v128_shr_n_byte(a.lo, 16 - (n))), \ + v128_shl_n_byte(a.lo, (n))) \ + : v256_from_v128((n) > 16 ? v128_shl_n_byte(a.lo, (n)-16) : a.lo, \ + v128_zero())) + +#define v256_shr_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n), \ + v128_or(v128_shr_n_byte(a.lo, n), \ + v128_shl_n_byte(a.hi, 16 - (n)))) \ + : v256_from_v128(v128_zero(), \ + (n) > 16 ? v128_shr_n_byte(a.hi, (n)-16) : a.hi)) + +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) + +#define v256_shl_n_8(a, n) \ + v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n)) +#define v256_shl_n_16(a, n) \ + v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n)) +#define v256_shl_n_32(a, n) \ + v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n)) +#define v256_shr_n_u8(a, n) \ + v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n)) +#define v256_shr_n_u16(a, n) \ + v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n)) +#define v256_shr_n_u32(a, n) \ + v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n)) +#define v256_shr_n_s8(a, n) \ + v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n)) +#define v256_shr_n_s16(a, n) \ + v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n)) +#define v256_shr_n_s32(a, n) \ + v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n)) + +#endif /* _V256_INTRINSICS_V128_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h new file mode 100644 index 000000000..b82daab68 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V256_INTRINSICS_H +#define _V256_INTRINSICS_H + +#if !defined(__AVX2__) + +#include "./v256_intrinsics_v128.h" + +#else + +// The _m256i type seems to cause problems for g++'s mangling prior to +// version 5, but adding -fabi-version=0 fixes this. +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \ + defined(__AVX2__) && defined(__cplusplus) +#pragma GCC optimize "-fabi-version=0" +#endif + +#include +#include "./v128_intrinsics_x86.h" + +typedef __m256i v256; + +SIMD_INLINE uint32_t v256_low_u32(v256 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0)); +} + +SIMD_INLINE v64 v256_low_v64(v256 a) { + return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero()); +} + +SIMD_INLINE v128 v256_low_v128(v256 a) { + return _mm256_extracti128_si256(a, 0); +} + +SIMD_INLINE v128 v256_high_v128(v256 a) { + return _mm256_extracti128_si256(a, 1); +} + +SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) { + // gcc seems to be missing _mm256_set_m128i() + return _mm256_insertf128_si256( + _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1); +} + +SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { + return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d)); +} + +SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { + return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d)); +} + +SIMD_INLINE v256 v256_load_aligned(const void *p) { + return _mm256_load_si256((const __m256i *)p); +} + +SIMD_INLINE v256 v256_load_unaligned(const void *p) { + return _mm256_loadu_si256((const __m256i *)p); +} + +SIMD_INLINE void v256_store_aligned(void *p, v256 a) { + _mm256_store_si256((__m256i *)p, a); +} + +SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { + _mm256_storeu_si256((__m256i *)p, a); +} + +SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); } + +SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); } + +SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); } + +SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); } + +SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); } + +SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } + +SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { + return _mm256_adds_epi16(a, b); +} + +SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } + +SIMD_INLINE v256 v256_padd_s16(v256 a) { + return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); +} + +SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); } + +SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); } + +SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); } + +SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); } + +SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { + return _mm256_subs_epi16(a, b); +} + +SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { + return _mm256_subs_epu16(a, b); +} + +SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } + +SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); } + +SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } + +// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit +// lanes of lower or upper halves of a 256bit vector because the +// unpack/pack intrinsics operate on the 256 bit input vector as 2 +// independent 128 bit vectors. +SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_8(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_8(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_16(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_16(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_32(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_32(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)), + v128_ziplo_64(v256_low_v128(a), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { + return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)), + v128_ziplo_64(v256_high_v128(a), v256_high_v128(b))); +} + +SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { + return v256_from_v128(v256_low_v128(a), v256_low_v128(b)); +} + +SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { + return v256_from_v128(v256_high_v128(a), v256_high_v128(b)); +} + +SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b)); +} + +SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b)); +} + +SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { + return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); +} + +SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)), + v128_unziplo_8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)), + v128_unziphi_8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)), + v128_unziplo_16(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)), + v128_unziphi_16(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { + return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)), + v128_unziplo_32(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)), + v128_unziphi_32(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)), + v128_unpacklo_u8_s16(v256_low_v128(a))); +} + +SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)), + v128_unpacklo_u8_s16(v256_high_v128(a))); +} + +SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { + return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a)); +} + +SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(v256_low_v128(a)), + v128_unpacklo_s8_s16(v256_low_v128(a))); +} + +SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { + return v256_from_v128(v128_unpackhi_s8_s16(v256_high_v128(a)), + v128_unpacklo_s8_s16(v256_high_v128(a))); +} + +SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)), + v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)), + v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { + return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)), + v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b))); +} + +SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a)); +} + +SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { + return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a)); +} + +SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)), + v128_unpacklo_u16_s32(v256_low_v128(a))); +} + +SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)), + v128_unpacklo_s16_s32(v256_low_v128(a))); +} + +SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)), + v128_unpacklo_u16_s32(v256_high_v128(a))); +} + +SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { + return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)), + v128_unpacklo_s16_s32(v256_high_v128(a))); +} +SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { + v128 c16 = v128_dup_8(16); + v128 hi = v256_high_v128(pattern); + v128 lo = v256_low_v128(pattern); + v128 maskhi = v128_cmplt_s8(hi, c16); + v128 masklo = v128_cmplt_s8(lo, c16); + return v256_from_v128( + v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi), + v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)), + maskhi)), + v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo), + v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)), + masklo))); +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return _mm256_shuffle_epi8(a, pattern); +} + +SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { + v256 r = _mm256_madd_epi16(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + +SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { + v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); + v128 lo = v256_low_v128(t); + v128 hi = v256_high_v128(t); + lo = v128_add_32(lo, hi); + return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo)); +} + +typedef v256 sad256_internal; + +SIMD_INLINE sad256_internal v256_sad_u8_init() { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_sum(). + The result for more than 32 v256_sad_u8() calls is undefined. */ +SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { + return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +typedef v256 ssd256_internal; + +SIMD_INLINE ssd256_internal v256_ssd_u8_init() { + return _mm256_setzero_si256(); +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_sum(). */ +SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { + v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), + _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); + v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()), + _mm256_unpackhi_epi8(b, _mm256_setzero_si256())); + v256 rl = _mm256_madd_epi16(l, l); + v256 rh = _mm256_madd_epi16(h, h); + v128 c = _mm_cvtsi32_si128(32); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8)); + rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8)); + rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4)); + return _mm256_add_epi64( + s, + _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c)); +} + +SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { + v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s)); + return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t))); +} + +SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); } + +SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); } + +SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); } + +SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); } + +SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) { + v128 lo_bits = v128_mullo_s16(a, b); + v128 hi_bits = v128_mulhi_s16(a, b); + return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits), + v128_ziplo_16(hi_bits, lo_bits)); +} + +SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { + return _mm256_mullo_epi16(a, b); +} + +SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { + return _mm256_mulhi_epi16(a, b); +} + +SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { + return _mm256_mullo_epi32(a, b); +} + +SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { + return _mm256_madd_epi16(a, b); +} + +SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { + return _mm256_maddubs_epi16(a, b); +} + +SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); } + +SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { + return _mm256_sub_epi8( + _mm256_avg_epu8(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); +} + +SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } + +SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } + +SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } + +SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } + +SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); } + +SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); } + +SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } + +SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { + return _mm256_cmpgt_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { + return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a)); +} + +SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { + return _mm256_cmpeq_epi8(a, b); +} + +SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { + return _mm256_cmpgt_epi16(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { + return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a)); +} + +SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { + return _mm256_cmpeq_epi16(a, b); +} + +SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)), + _mm256_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { + return _mm256_and_si256(_mm256_set1_epi8(0xff >> c), + _mm256_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { + __m128i x = _mm_cvtsi32_si128(c + 8); + return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x), + _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x)); +} + +SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { + return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { + return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { + return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { + return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { + return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { + return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +// _mm256_slli_si256 works on 128 bit lanes and can't be used +#define v256_shl_n_byte(a, n) \ + ((n) < 16 \ + ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n), \ + v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \ + v128_shl_n_byte(v256_low_v128(a), n)) \ + : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16), \ + v128_zero())) + +// _mm256_srli_si256 works on 128 bit lanes and can't be used +#define v256_shr_n_byte(a, n) \ + ((n) < 16 \ + ? _mm256_alignr_epi8( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ + : ((n) > 16 \ + ? _mm256_srli_si256( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \ + (n)-16) \ + : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)))) + +// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used +#define v256_align(a, b, c) \ + ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b) + +#define v256_shl_n_8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \ + _mm256_slli_epi16(a, c)) +#define v256_shr_n_u8(a, c) \ + _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c)) +#define v256_shr_n_s8(a, c) \ + _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \ + _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8)) +#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c) +#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c) +#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c) +#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) +#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) +#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) +#endif + +#endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h new file mode 100644 index 000000000..ee2b683a4 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V64_INTRINSICS_H +#define _V64_INTRINSICS_H + +#include +#include +#include "./v64_intrinsics_c.h" + +/* Fallback to plain, unoptimised C. */ + +typedef c_v64 v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); } +SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); } +SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); } +SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); } +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return c_v64_from_32(x, y); +} +SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); } +SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); } +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return c_v64_from_16(a, b, c, d); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return c_u32_load_unaligned(p); +} +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return c_u32_load_aligned(p); +} +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + c_u32_store_unaligned(p, a); +} +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + c_u32_store_aligned(p, a); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return c_v64_load_unaligned(p); +} +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return c_v64_load_aligned(p); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + c_v64_store_unaligned(p, a); +} +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + c_v64_store_aligned(p, a); +} + +SIMD_INLINE v64 v64_align(v64 a, v64 b, c) { return c_v64_align(a, b, c); } + +SIMD_INLINE v64 v64_zero() { return c_v64_zero(); } +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); } +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); } +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); } +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); } +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); } +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); } +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); } +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); } +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); } +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); } +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); } +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); } +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); } +SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); } +SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); } + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); } +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); } +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); } +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); } +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); } +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); } +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); } +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); } +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); } +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); } +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); } +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); } +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + return c_v64_pack_s32_s16(a, b); +} +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + return c_v64_pack_s16_u8(a, b); +} +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + return c_v64_pack_s16_s8(a, b); +} +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return c_v64_unpacklo_u16_s32(a); +} +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return c_v64_unpacklo_s16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return c_v64_unpackhi_u16_s32(a); +} +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return c_v64_unpackhi_s16_s32(a); +} +SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) { + return c_v64_shuffle_8(a, pattern); +} + +typedef uint32_t sad64_internal; +SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); } +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return c_v64_sad_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { + return c_v64_sad_u8_sum(s); +} +typedef uint32_t ssd64_internal; +SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); } +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + return c_v64_ssd_u8(s, a, b); +} +SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { + return c_v64_ssd_u8_sum(s); +} +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); } +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); } +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); } +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); } +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); } +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); } +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); } +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); } +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); } +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); } +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); } +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); } +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); } +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); } +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); } +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); } +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); } +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); } +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); } +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); } +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); } +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); } +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) { + return c_v64_shr_u16(a, n); +} +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) { + return c_v64_shr_s16(a, n); +} +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); } +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) { + return c_v64_shr_u32(a, n); +} +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) { + return c_v64_shr_s32(a, n); +} +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) { + return c_v64_shr_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) { + return c_v64_shl_n_byte(a, n); +} +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return c_v64_shl_n_8(a, c); +} +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return c_v64_shr_n_u8(a, c); +} +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return c_v64_shr_n_s8(a, c); +} +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return c_v64_shl_n_16(a, c); +} +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return c_v64_shr_n_u16(a, c); +} +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return c_v64_shr_n_s16(a, c); +} +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return c_v64_shl_n_32(a, c); +} +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return c_v64_shr_n_u32(a, c); +} +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return c_v64_shr_n_s32(a, c); +} + +#endif /* _V64_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h new file mode 100644 index 000000000..c7574eef5 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V64_INTRINSICS_H +#define _V64_INTRINSICS_H + +#include +#include "./v64_intrinsics_arm.h" +#include "aom_ports/arm.h" + +#ifdef AOM_INCOMPATIBLE_GCC +#error Incompatible gcc +#endif + +typedef int64x1_t v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { + return vget_lane_u32(vreinterpret_u32_s64(a), 0); +} + +SIMD_INLINE uint32_t v64_high_u32(v64 a) { + return vget_lane_u32(vreinterpret_u32_s64(a), 1); +} + +SIMD_INLINE int32_t v64_low_s32(v64 a) { + return vget_lane_s32(vreinterpret_s32_s64(a), 0); +} + +SIMD_INLINE int32_t v64_high_s32(v64 a) { + return vget_lane_s32(vreinterpret_s32_s64(a), 1); +} + +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 | + d); +} + +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return vcreate_s64((uint64_t)x << 32 | y); +} + +SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); } + +SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; } + +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0); +} + +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { +#if defined(__clang__) + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a), + 0); +#elif defined(__CC_ARM) + *(__packed uint32_t *)p) = a; +#elif defined(__GNUC__) + *((__attribute((packed)) uint32_t *)p) = a; +#else + vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a), + 0); +#endif +} + +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p)); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return v64_load_aligned(p); +} + +SIMD_INLINE void v64_store_aligned(void *p, v64 r) { + vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 r) { + vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); +} + +// The following function requires an immediate. +// Some compilers will check this if it's optimising, others wont. +SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + return c ? vreinterpret_s64_s8( + vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c)) + : b; +#else + return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8)) + : b; +#endif +} + +SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { + return vreinterpret_s64_u8(vdup_n_u8(x)); +} + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { + return vreinterpret_s64_u16(vdup_n_u16(x)); +} + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { + return vreinterpret_s64_u32(vdup_n_u32(x)); +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { + int64x2_t r = vpaddlq_s32(vpaddlq_s16( + vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))))); + return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r)); +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { + int64x2_t r = + vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); + return (int64_t)(vget_high_s64(r) + vget_low_s64(r)); +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { + return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))); +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))); +} + +typedef uint16x8_t sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); } + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. */ +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { + uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); + return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r)); +} + +typedef int64x1_t ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init() { + return (ssd64_internal)(uint64_t)0; +} + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); + uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t))); + return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { + return (uint32_t)(uint64_t)s; +} + +SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } + +SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); } + +SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); } + +SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); } + +SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_add_32(v64 x, v64 y) { + return vreinterpret_s64_u32( + vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y))); +} + +SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_abs_s16(v64 x) { + return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x))); +} + +SIMD_INLINE v64 v64_abs_s8(v64 x) { + return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x))); +} + +SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { + return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); +} + +SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { + return vreinterpret_s64_s32( + vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); +} + +SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { + int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)); + return vreinterpret_s64_s32( + vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))), + vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t))))); +} + +SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { + return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16( + vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)), + vreinterpret_s8_s64(y)), + vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7))))); +} + +SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + +SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { + return vreinterpret_s64_s16( + vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +} + +SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { + uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +} + +SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[0]); +} + +SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { + int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); + return vreinterpret_s64_s16(r.val[1]); +} + +SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[0]); +} + +SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { + int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); + return vreinterpret_s64_s32(r.val[1]); +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a)))); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { + return vreinterpret_s64_s16(vqmovn_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { + return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { + return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); +} + +SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[0]); +} + +SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { + uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); + return vreinterpret_s64_u8(r.val[1]); +} + +SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { + uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[0]); +} + +SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { + uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); + return vreinterpret_s64_u16(r.val[1]); +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { + return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) { + return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) { + return vreinterpret_s64_s32( + vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x)))); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) { + return vreinterpret_s64_u32( + vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x)))); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { + return vreinterpret_s64_u8( + vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern))); +} + +SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +} + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c))); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c))); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return vreinterpret_s64_u16( + vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return vreinterpret_s64_s16( + vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c))); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c))); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return vreinterpret_s64_u32( + vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c))); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return vreinterpret_s64_s32( + vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c))); +} + +// The following functions require an immediate. +// Some compilers will check this during optimisation, others wont. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return vshl_n_s64(a, c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a; +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { + return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { + return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)); +} + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)); +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)); +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)); +} + +#else + +SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) << c * 8); +} + +SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { + return v64_from_64(v64_u64(a) >> c * 8); +} + +SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); } + +SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); } + +SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); } + +SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); } + +SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { + return v64_shr_u16(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { + return v64_shr_s16(a, c); +} + +SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); } + +SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { + return v64_shr_u32(a, c); +} + +SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { + return v64_shr_s32(a, c); +} + +#endif + +#endif /* _V64_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h new file mode 100644 index 000000000..5032238b6 --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h @@ -0,0 +1,919 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V64_INTRINSICS_C_H +#define _V64_INTRINSICS_C_H + +/* Note: This implements the intrinsics in plain, unoptimised C. + Intended for reference, porting or debugging. */ + +#include +#include +#include "./aom_config.h" + +typedef union { + uint8_t u8[8]; + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64; + int8_t s8[8]; + int16_t s16[4]; + int32_t s32[2]; + int64_t s64; +} c_v64; + +SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; } + +SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { + return a.u32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; } + +SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { + return a.s32[!CONFIG_BIG_ENDIAN]; +} + +SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { + c_v64 t; + t.u32[!CONFIG_BIG_ENDIAN] = x; + t.u32[CONFIG_BIG_ENDIAN] = y; + return t; +} + +SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) { + c_v64 t; + t.u64 = x; + return t; +} + +SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; } + +SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c, + uint16_t d) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + t.u16[0] = a; + t.u16[1] = b; + t.u16[2] = c; + t.u16[3] = d; + } else { + t.u16[3] = a; + t.u16[2] = b; + t.u16[1] = c; + t.u16[0] = d; + } + return t; +} + +SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) { + uint32_t t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 4; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) { + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&a; + int c; + for (c = 0; c < 4; c++) pp[c] = q[c]; +} + +SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 load at %p\n", p); + abort(); + } + return c_u32_load_unaligned(p); +} + +SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) { + if (SIMD_CHECK && (uintptr_t)p & 3) { + fprintf(stderr, "Error: Unaligned u32 store at %p\n", p); + abort(); + } + c_u32_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) { + c_v64 t; + uint8_t *pp = (uint8_t *)p; + uint8_t *q = (uint8_t *)&t; + int c; + for (c = 0; c < 8; c++) q[c] = pp[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p); + abort(); + } + return c_v64_load_unaligned(p); +} + +SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) { + uint8_t *q = (uint8_t *)p; + uint8_t *r = (uint8_t *)&a; + int c; + for (c = 0; c < 8; c++) q[c] = r[c]; +} + +SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) { + if (SIMD_CHECK && (uintptr_t)p & 7) { + fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p); + abort(); + } + c_v64_store_unaligned(p, a); +} + +SIMD_INLINE c_v64 c_v64_zero() { + c_v64 t; + t.u64 = 0; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) { + c_v64 t; + t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] = + t.u8[7] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) { + c_v64 t; + t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) { + c_v64 t; + t.u32[0] = t.u32[1] = x; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768 + ? -32768 + : (int32_t)a.s16[c] + (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]); + t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c]; + t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d); + } + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768 + ? -32768 + : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767 + ? 32767 + : (int32_t)a.s16[c] - (int32_t)b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = + (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) { + c_v64 t; + t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]); + t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) + t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]; + return t; +} + +SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = a.u8[7]; + t.u8[6] = b.u8[7]; + t.u8[5] = a.u8[6]; + t.u8[4] = b.u8[6]; + t.u8[3] = a.u8[5]; + t.u8[2] = b.u8[5]; + t.u8[1] = a.u8[4]; + t.u8[0] = b.u8[4]; + } else { + t.u8[7] = a.u8[3]; + t.u8[6] = b.u8[3]; + t.u8[5] = a.u8[2]; + t.u8[4] = b.u8[2]; + t.u8[3] = a.u8[1]; + t.u8[2] = b.u8[1]; + t.u8[1] = a.u8[0]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = a.u16[3]; + t.u16[2] = b.u16[3]; + t.u16[1] = a.u16[2]; + t.u16[0] = b.u16[2]; + } else { + t.u16[3] = a.u16[1]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[0]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u32[1] = a.u32[1]; + t.u32[0] = b.u32[1]; + } else { + t.u32[1] = a.u32[0]; + t.u32[0] = b.u32[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u8[7] = b.u8[7]; + t.u8[6] = b.u8[5]; + t.u8[5] = b.u8[3]; + t.u8[4] = b.u8[1]; + t.u8[3] = a.u8[7]; + t.u8[2] = a.u8[5]; + t.u8[1] = a.u8[3]; + t.u8[0] = a.u8[1]; + } else { + t.u8[7] = a.u8[6]; + t.u8[6] = a.u8[4]; + t.u8[5] = a.u8[2]; + t.u8[4] = a.u8[0]; + t.u8[3] = b.u8[6]; + t.u8[2] = b.u8[4]; + t.u8[1] = b.u8[2]; + t.u8[0] = b.u8[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1); +} + +SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) { + c_v64 t; + if (mode) { + t.u16[3] = b.u16[3]; + t.u16[2] = b.u16[1]; + t.u16[1] = a.u16[3]; + t.u16[0] = a.u16[1]; + } else { + t.u16[3] = a.u16[2]; + t.u16[2] = a.u16[0]; + t.u16[1] = b.u16[2]; + t.u16[0] = b.u16[0]; + } + return t; +} + +SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1) + : _c_v64_unzip_16(a, b, 0); +} + +SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) { + return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0) + : _c_v64_unzip_16(b, a, 1); +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[3 + endian]; + t.s16[2] = (int16_t)a.u8[2 + endian]; + t.s16[1] = (int16_t)a.u8[1 + endian]; + t.s16[0] = (int16_t)a.u8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.u8[7 - endian]; + t.s16[2] = (int16_t)a.u8[6 - endian]; + t.s16[1] = (int16_t)a.u8[5 - endian]; + t.s16[0] = (int16_t)a.u8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[3 + endian]; + t.s16[2] = (int16_t)a.s8[2 + endian]; + t.s16[1] = (int16_t)a.s8[1 + endian]; + t.s16[0] = (int16_t)a.s8[0 + endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) { + c_v64 t; + int endian = !!CONFIG_BIG_ENDIAN * 4; + t.s16[3] = (int16_t)a.s8[7 - endian]; + t.s16[2] = (int16_t)a.s8[6 - endian]; + t.s16[1] = (int16_t)a.s8[5 - endian]; + t.s16[0] = (int16_t)a.s8[4 - endian]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1]; + t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0]; + t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1]; + t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3]; + t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2]; + t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1]; + t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0]; + t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3]; + t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2]; + t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1]; + t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3]; + t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2]; + t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1]; + t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0]; + t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3]; + t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2]; + t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1]; + t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) { + c_v64 t; + t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2]; + t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2]; + return t; +} + +SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) { + if (SIMD_CHECK && (pattern.u8[c] & ~7)) { + fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n", + pattern.u8[c], c); + abort(); + } + t.u8[c] = + a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7]; + } + return t; +} + +SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) { + return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] + + a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] + + a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0]; +} + +SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) { + return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) + + (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]); +} + +SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) { + return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] + + a.u8[0]; +} + +SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) { + return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0]; +} + +typedef uint32_t c_sad64_internal; + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. */ +SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; } + +SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) + s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c]; + return s; +} + +SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; } + +typedef uint32_t c_ssd64_internal; + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; } + +SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a, + c_v64 b) { + int c; + for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]); + return s; +} + +SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; } + +SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 | b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 ^ b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) { + c_v64 t; + t.u64 = a.u64 & ~b.u64; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16; + return t; +} + +SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]); + t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]); + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) { + c_v64 t; + t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1]; + t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3]; + return t; +} + +SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) { + c_v64 t; + int32_t u; + u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1]; + t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3]; + t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5]; + t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7]; + t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]); + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift left %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined u8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 7) { + fprintf(stderr, "Error: Undefined s8 shift right %d\n", n); + abort(); + } + for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift left %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: Undefined u16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) { + c_v64 t; + int c; + if (SIMD_CHECK && n > 15) { + fprintf(stderr, "Error: undefined s16 shift right %d\n", n); + abort(); + } + for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift left %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] << n; + t.u32[0] = a.u32[0] << n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined u32 shift right %d\n", n); + abort(); + } + t.u32[1] = a.u32[1] >> n; + t.u32[0] = a.u32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) { + c_v64 t; + if (SIMD_CHECK && n > 31) { + fprintf(stderr, "Error: undefined s32 shift right %d\n", n); + abort(); + } + t.s32[1] = a.s32[1] >> n; + t.s32[0] = a.s32[0] >> n; + return t; +} + +SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 >> i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) { + c_v64 t; + t.u64 = x.u64 << i * 8; + return t; +} + +SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) { + if (SIMD_CHECK && c > 7) { + fprintf(stderr, "Error: undefined alignment %d\n", c); + abort(); + } + return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b; +} + +SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) { + return c_v64_shl_8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) { + return c_v64_shr_u8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) { + return c_v64_shr_s8(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) { + return c_v64_shl_16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) { + return c_v64_shr_u16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) { + return c_v64_shr_s16(a, c); +} + +SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) { + return c_v64_shl_32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) { + return c_v64_shr_u32(a, c); +} + +SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { + return c_v64_shr_s32(a, c); +} + +#endif /* _V64_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h new file mode 100644 index 000000000..8dcc9f6fc --- /dev/null +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h @@ -0,0 +1,470 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _V64_INTRINSICS_H +#define _V64_INTRINSICS_H + +#include +#if defined(__SSSE3__) +#include +#endif +#if defined(__SSE4_1__) +#include +#endif + +typedef __m128i v64; + +SIMD_INLINE uint32_t v64_low_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(a); +} + +SIMD_INLINE uint32_t v64_high_u32(v64 a) { + return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); } + +SIMD_INLINE int32_t v64_high_s32(v64 a) { + return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4)); +} + +SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { + return _mm_packs_epi32( + _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d), + _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { + return _mm_set_epi32(0, 0, x, y); +} + +SIMD_INLINE v64 v64_from_64(uint64_t x) { +#ifdef __x86_64__ + return _mm_cvtsi64_si128(x); +#else + return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x); +#endif +} + +SIMD_INLINE uint64_t v64_u64(v64 x) { + return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32); +} + +SIMD_INLINE uint32_t u32_load_aligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { + return *((uint32_t *)p); +} + +SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { + *((uint32_t *)p) = a; +} + +SIMD_INLINE v64 v64_load_aligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE v64 v64_load_unaligned(const void *p) { + return _mm_loadl_epi64((__m128i *)p); +} + +SIMD_INLINE void v64_store_aligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { + _mm_storel_epi64((__m128i *)p, a); +} + +// The following function requires an immediate. +#if defined(__OPTIMIZE__) && __OPTIMIZE__ +#define v64_align(a, b, c) \ + ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) +#else +#define v64_align(a, b, c) \ + ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \ + : (b)) +#endif + +SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); } + +SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); } + +SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); } + +SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); } + +SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } + +SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } + +SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } + +SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } + +SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); } + +SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); } + +SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); } + +SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); } + +SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); } + +SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); } + +SIMD_INLINE v64 v64_abs_s16(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi16(a); +#else + return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); +#endif +} + +SIMD_INLINE v64 v64_abs_s8(v64 a) { +#if defined(__SSSE3__) + return _mm_abs_epi8(a); +#else + v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128()); + return _mm_xor_si128(sign, _mm_add_epi8(a, sign)); +#endif +} + +SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); } + +SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); } + +SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8); +} + +SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); } + +SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { + return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8); +} + +SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi32(t, t); +} + +SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi16(t, t); +} + +SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) { + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packs_epi16(t, t); +} + +SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0d0b0907050301LL)); +#else + return _mm_packus_epi16( + _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0e0c0a0806040200LL)); +#else + return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1)); +#endif +} + +SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0f0e0b0a07060302LL)); +#else + return _mm_packs_epi32( + _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)), + _mm_setzero_si128()); +#endif +} + +SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a), + v64_from_64(0x0d0c090805040100LL)); +#else + return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2)); +#endif +} + +SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { + return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { + return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8); +} + +SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); +} + +SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) { + return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16); +} + +SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) { + return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8); +} + +SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) { + return _mm_srli_si128( + _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8); +} + +SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { +#if defined(__SSSE3__) + return _mm_shuffle_epi8(x, pattern); +#else + v64 output; + unsigned char *input = (unsigned char *)&x; + unsigned char *index = (unsigned char *)&pattern; + char *selected = (char *)&output; + int counter; + + for (counter = 0; counter < 8; counter++) { + selected[counter] = input[index[counter]]; + } + + return output; +#endif +} + +SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { + __m128i r, r1, r2, z; + z = _mm_setzero_si128(); + r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8), + _mm_unpacklo_epi8(b, z)); + r2 = _mm_srli_si128(r1, 8); + r = _mm_add_epi32(r1, r2); + r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); + return ((int32_t)v64_low_u32(r)) >> 8; +} + +SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { + __m128i r = _mm_madd_epi16(a, b); +#if defined(__SSE4_1__) && defined(__x86_64__) + __m128i x = _mm_cvtepi32_epi64(r); + return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8))); +#else + return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(r); +#endif +} + +SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { + return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128())); +} + +SIMD_INLINE int64_t v64_hadd_s16(v64 a) { + return v64_dotp_s16(a, v64_dup_16(1)); +} + +typedef v64 sad64_internal; + +SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + v64_sad_u8_sum(). + The result for more than 32 v64_sad_u8() calls is undefined. */ +SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { + return _mm_add_epi64(s, _mm_sad_epu8(a, b)); +} + +SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +typedef v64 ssd64_internal; + +SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); } + +/* Implementation dependent return value. Result must be finalised with + * v64_ssd_u8_sum(). */ +SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { + v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b)); + v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b)); + v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h)); + return _mm_add_epi64( + s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4)))); +} + +SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); } + +SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); } + +SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); } + +SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); } + +SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); } + +SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); } + +SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); } + +SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_mullo_epi32(a, b); +#else + return _mm_unpacklo_epi32( + _mm_mul_epu32(a, b), + _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4))); +#endif +} + +SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); } + +SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { +#if defined(__SSSE3__) + return _mm_maddubs_epi16(a, b); +#else + __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), + _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)); + return _mm_packs_epi32(t, t); +#endif +} + +SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); } + +SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { + return _mm_sub_epi8(_mm_avg_epu8(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); +} + +SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } + +SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } + +SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); } + +SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_min_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { +#if defined(__SSE4_1__) + return _mm_max_epi8(a, b); +#else + v64 mask = _mm_cmplt_epi8(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); } + +SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); } + +SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); } + +SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); } + +SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)), + _mm_sll_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { + return _mm_and_si128(_mm_set1_epi8(0xff >> c), + _mm_srl_epi16(a, _mm_cvtsi32_si128(c))); +} + +SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { + return _mm_packs_epi16( + _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a); +} + +SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); +} + +/* These intrinsics require immediate values, so we must use #defines + to enforce that. */ +#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c) +#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8) +#define v64_shl_n_8(a, c) \ + _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) +#define v64_shr_n_u8(a, c) \ + _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c)) +#define v64_shr_n_s8(a, c) \ + _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a) +#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c) +#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c) +#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c) +#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c) +#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) +#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) + +#endif /* _V64_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c new file mode 100644 index 000000000..141bf01c7 --- /dev/null +++ b/third_party/aom/aom_dsp/ssim.c @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/ssim.h" +#include "aom_ports/mem.h" +#include "aom_ports/system_state.h" + +void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +#endif // CONFIG_HIGHBITDEPTH + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + int64_t ssim_n, ssim_d; + int64_t c1, c2; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + c1 = c2 = 0; + assert(0); + } + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +#if CONFIG_HIGHBITDEPTH +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} +#endif // CONFIG_HIGHBITDEPTH + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. +static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +#if CONFIG_HIGHBITDEPTH +static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} +#endif // CONFIG_HIGHBITDEPTH + +double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight) { + double a, b, c; + double ssimv; + + a = aom_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height); + + b = aom_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); + + c = aom_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, source->uv_crop_height); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. +// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. +// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + aom_clear_system_state(); + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. + if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. + // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. + var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. + // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. + this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. / (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} + +#if CONFIG_HIGHBITDEPTH +double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { + double a, b, c; + double ssimv; + uint32_t shift = 0; + + assert(bd >= in_bd); + shift = bd - in_bd; + + a = aom_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, + source->y_crop_height, in_bd, shift); + + b = aom_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + c = aom_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h new file mode 100644 index 000000000..902735e50 --- /dev/null +++ b/third_party/aom/aom_dsp/ssim.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_SSIM_H_ +#define AOM_DSP_SSIM_H_ + +#define MAX_SSIM_DB 100.0; + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./aom_config.h" +#include "aom_scale/yv12config.h" + +// metrics used for calculating ssim, ssim2, dssim, and ssimc +typedef struct { + // source sum ( over 8x8 region ) + uint32_t sum_s; + + // reference sum (over 8x8 region ) + uint32_t sum_r; + + // source sum squared ( over 8x8 region ) + uint32_t sum_sq_s; + + // reference sum squared (over 8x8 region ) + uint32_t sum_sq_r; + + // sum of source times reference (over 8x8 region) + uint32_t sum_sxr; + + // calculated ssim score between source and reference + double ssim; +} Ssimv; + +// metrics collected on a frame basis +typedef struct { + // ssim consistency error metric ( see code for explanation ) + double ssimc; + + // standard ssim + double ssim; + + // revised ssim ( see code for explanation) + double ssim2; + + // ssim restated as an error metric like sse + double dssim; + + // dssim converted to decibels + double dssimd; + + // ssimc converted to decibels + double ssimcd; +} Metrics; + +double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency); + +double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight); + +double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *ssim_y, + double *ssim_u, double *ssim_v, uint32_t bd, + uint32_t in_bd); + +#if CONFIG_HIGHBITDEPTH +double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd); +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_SSIM_H_ diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c new file mode 100644 index 000000000..8dda96efb --- /dev/null +++ b/third_party/aom/aom_dsp/subtract.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +void aom_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r, c; + + for (r = 0; r < rows; r++) { + for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { + int r, c; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + (void)bd; + + for (r = 0; r < rows; r++) { + for (c = 0; c < cols; c++) { + diff[c] = src[c] - pred[c]; + } + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c new file mode 100644 index 000000000..b9155fdc0 --- /dev/null +++ b/third_party/aom/aom_dsp/sum_squares.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width, + int height) { + int r, c; + uint64_t ss = 0; + + for (r = 0; r < height; r++) { + for (c = 0; c < width; c++) { + const int16_t v = src[c]; + ss += v * v; + } + src += src_stride; + } + + return ss; +} + +uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) { + uint64_t ss = 0; + do { + const int16_t v = *src++; + ss += v * v; + } while (--n); + + return ss; +} diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h new file mode 100644 index 000000000..a5e964aad --- /dev/null +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_TXFM_COMMON_H_ +#define AOM_DSP_TXFM_COMMON_H_ + +#include "aom_dsp/aom_dsp_common.h" + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define UNIT_QUANT_SHIFT 2 +#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) + +// Constants: +// for (int i = 1; i< 32; ++i) +// printf("static const int cospi_%d_64 = %.0f;\n", i, +// round(16384 * cos(i*M_PI/64))); +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const tran_high_t cospi_1_64 = 16364; +static const tran_high_t cospi_2_64 = 16305; +static const tran_high_t cospi_3_64 = 16207; +static const tran_high_t cospi_4_64 = 16069; +static const tran_high_t cospi_5_64 = 15893; +static const tran_high_t cospi_6_64 = 15679; +static const tran_high_t cospi_7_64 = 15426; +static const tran_high_t cospi_8_64 = 15137; +static const tran_high_t cospi_9_64 = 14811; +static const tran_high_t cospi_10_64 = 14449; +static const tran_high_t cospi_11_64 = 14053; +static const tran_high_t cospi_12_64 = 13623; +static const tran_high_t cospi_13_64 = 13160; +static const tran_high_t cospi_14_64 = 12665; +static const tran_high_t cospi_15_64 = 12140; +static const tran_high_t cospi_16_64 = 11585; +static const tran_high_t cospi_17_64 = 11003; +static const tran_high_t cospi_18_64 = 10394; +static const tran_high_t cospi_19_64 = 9760; +static const tran_high_t cospi_20_64 = 9102; +static const tran_high_t cospi_21_64 = 8423; +static const tran_high_t cospi_22_64 = 7723; +static const tran_high_t cospi_23_64 = 7005; +static const tran_high_t cospi_24_64 = 6270; +static const tran_high_t cospi_25_64 = 5520; +static const tran_high_t cospi_26_64 = 4756; +static const tran_high_t cospi_27_64 = 3981; +static const tran_high_t cospi_28_64 = 3196; +static const tran_high_t cospi_29_64 = 2404; +static const tran_high_t cospi_30_64 = 1606; +static const tran_high_t cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const tran_high_t sinpi_1_9 = 5283; +static const tran_high_t sinpi_2_9 = 9929; +static const tran_high_t sinpi_3_9 = 13377; +static const tran_high_t sinpi_4_9 = 15212; + +// 16384 * sqrt(2) +static const tran_high_t Sqrt2 = 23170; + +#endif // AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c new file mode 100644 index 000000000..9fc0db783 --- /dev/null +++ b/third_party/aom/aom_dsp/variance.c @@ -0,0 +1,1249 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/variance.h" +#include "aom_dsp/aom_filter.h" + +uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride) { + int distortion = 0; + int r, c; + + for (r = 0; r < 4; ++r) { + for (c = 0; c < 4; ++c) { + int diff = a[c] - b[c]; + distortion += diff * diff; + } + + a += a_stride; + b += b_stride; + } + + return distortion; +} + +uint32_t aom_get_mb_ss_c(const int16_t *a) { + unsigned int i, sum = 0; + + for (i = 0; i < 256; ++i) { + sum += a[i] * a[i]; + } + + return sum; +} + +uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse); +} + +uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return aom_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse); +} + +uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse); +} + +static void variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h, uint32_t *sse, int *sum) { + int i, j; + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } +} + +uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { + uint32_t sse; + int sum; + variance(a, a_stride, b, b_stride, w, h, &sse, &sum); + return sse; +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). +// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +#define VAR(W, H) \ + uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define SUBPIX_VAR(W, H) \ + uint32_t aom_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ + } + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + \ + return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ + } + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / w*h + */ +#define GET_VAR(W, H) \ + void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + int *sum) { \ + variance(a, a_stride, b, b_stride, W, H, sse, sum); \ + } + +/* Identical to the variance call except it does not calculate the + * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * variable. + */ +#define MSE(W, H) \ + uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse; \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define VARIANCES(W, H) \ + VAR(W, H) \ + SUBPIX_VAR(W, H) \ + SUBPIX_AVG_VAR(W, H) + +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +VARIANCES(128, 128) +VARIANCES(128, 64) +VARIANCES(64, 128) +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) +VARIANCES(4, 2) +VARIANCES(2, 4) +VARIANCES(2, 2) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) + +void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +// Get pred block from up-sampled reference. +void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j, k; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0, k = 0; j < width; j++, k += 8) { + comp_pred[j] = ref[k]; + } + comp_pred += width; + ref += stride; + } +} + +void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + const int tmp = ref[(j << 3)] + pred[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += stride; + } +} + +#if CONFIG_HIGHBITDEPTH +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int i, j; + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + *sum = 0; + *sse = 0; + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int diff = a[j] - b[j]; + *sum += diff; + *sse += diff * diff; + } + a += a_stride; + b += b_stride; + } +} + +uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int w, int h) { + uint64_t sse; + int64_t sum; + highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum); + return sse; +} + +static void highbd_8_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)sse_long; + *sum = (int)sum_long; +} + +static void highbd_10_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +} + +static void highbd_12_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint32_t *sse, int *sum) { + uint64_t sse_long = 0; + int64_t sum_long = 0; + highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +} + +#define HIGHBD_VAR(W, H) \ + uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_GET_VAR(S) \ + void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } \ + \ + void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse, int *sum) { \ + highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ + } + +#define HIGHBD_MSE(W, H) \ + uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +void aom_highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + + ++src_ptr; + } + + // Next row... + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +void aom_highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + output_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; + } + + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + dst, dst_stride, sse); \ + } + +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } + +/* All three forms of the variance are available in the same sizes. */ +#define HIGHBD_VARIANCES(W, H) \ + HIGHBD_VAR(W, H) \ + HIGHBD_SUBPIX_VAR(W, H) \ + HIGHBD_SUBPIX_AVG_VAR(W, H) + +#if CONFIG_AV1 && CONFIG_EXT_PARTITION +HIGHBD_VARIANCES(128, 128) +HIGHBD_VARIANCES(128, 64) +HIGHBD_VARIANCES(64, 128) +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION +HIGHBD_VARIANCES(64, 64) +HIGHBD_VARIANCES(64, 32) +HIGHBD_VARIANCES(32, 64) +HIGHBD_VARIANCES(32, 32) +HIGHBD_VARIANCES(32, 16) +HIGHBD_VARIANCES(16, 32) +HIGHBD_VARIANCES(16, 16) +HIGHBD_VARIANCES(16, 8) +HIGHBD_VARIANCES(8, 16) +HIGHBD_VARIANCES(8, 8) +HIGHBD_VARIANCES(8, 4) +HIGHBD_VARIANCES(4, 8) +HIGHBD_VARIANCES(4, 4) +HIGHBD_VARIANCES(4, 2) +HIGHBD_VARIANCES(2, 4) +HIGHBD_VARIANCES(2, 2) + +HIGHBD_GET_VAR(8) +HIGHBD_GET_VAR(16) + +HIGHBD_MSE(16, 16) +HIGHBD_MSE(16, 8) +HIGHBD_MSE(8, 16) +HIGHBD_MSE(8, 8) + +void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[j]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} + +void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height, + const uint8_t *ref8, int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = ref[(j << 3)]; + } + comp_pred += width; + ref += stride; + } +} + +void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + const int tmp = pred[j] + ref[(j << 3)]; + comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + } + comp_pred += width; + pred += width; + ref += stride; + } +} +#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_AV1 && CONFIG_EXT_INTER +void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const uint8_t *m, int m_stride, int w, int h, + unsigned int *sse, int *sum) { + int i, j; + + int64_t sum64 = 0; + uint64_t sse64 = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = (a[j] - b[j]) * (m[j]); + sum64 += diff; + sse64 += diff * diff; + } + + a += a_stride; + b += b_stride; + m += m_stride; + } + sum64 = (sum64 >= 0) ? sum64 : -sum64; + *sum = (int)ROUND_POWER_OF_TWO(sum64, 6); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12); +} + +#define MASK_VAR(W, H) \ + unsigned int aom_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk, \ + msk_stride, sse); \ + } + +MASK_VAR(4, 4) +MASK_SUBPIX_VAR(4, 4) + +MASK_VAR(4, 8) +MASK_SUBPIX_VAR(4, 8) + +MASK_VAR(8, 4) +MASK_SUBPIX_VAR(8, 4) + +MASK_VAR(8, 8) +MASK_SUBPIX_VAR(8, 8) + +MASK_VAR(8, 16) +MASK_SUBPIX_VAR(8, 16) + +MASK_VAR(16, 8) +MASK_SUBPIX_VAR(16, 8) + +MASK_VAR(16, 16) +MASK_SUBPIX_VAR(16, 16) + +MASK_VAR(16, 32) +MASK_SUBPIX_VAR(16, 32) + +MASK_VAR(32, 16) +MASK_SUBPIX_VAR(32, 16) + +MASK_VAR(32, 32) +MASK_SUBPIX_VAR(32, 32) + +MASK_VAR(32, 64) +MASK_SUBPIX_VAR(32, 64) + +MASK_VAR(64, 32) +MASK_SUBPIX_VAR(64, 32) + +MASK_VAR(64, 64) +MASK_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +MASK_VAR(64, 128) +MASK_SUBPIX_VAR(64, 128) + +MASK_VAR(128, 64) +MASK_SUBPIX_VAR(128, 64) + +MASK_VAR(128, 128) +MASK_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_HIGHBITDEPTH +void highbd_masked_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m, + int m_stride, int w, int h, uint64_t *sse, + int64_t *sum) { + int i, j; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + *sum = 0; + *sse = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + const int diff = (a[j] - b[j]) * (m[j]); + *sum += (int64_t)diff; + *sse += (int64_t)diff * diff; + } + + a += a_stride; + b += b_stride; + m += m_stride; + } + *sum = (*sum >= 0) ? *sum : -*sum; + *sum = ROUND_POWER_OF_TWO(*sum, 6); + *sse = ROUND_POWER_OF_TWO(*sse, 12); +} + +void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, const uint8_t *m, int m_stride, int w, + int h, unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +void highbd_10_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +void highbd_12_masked_variance(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, + &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_MASK_VAR(W, H) \ + unsigned int aom_highbd_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \ + &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_masked_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ + sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } \ + \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_masked_variance##W##x##H##_c( \ + CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ + } + +HIGHBD_MASK_VAR(4, 4) +HIGHBD_MASK_SUBPIX_VAR(4, 4) + +HIGHBD_MASK_VAR(4, 8) +HIGHBD_MASK_SUBPIX_VAR(4, 8) + +HIGHBD_MASK_VAR(8, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 4) + +HIGHBD_MASK_VAR(8, 8) +HIGHBD_MASK_SUBPIX_VAR(8, 8) + +HIGHBD_MASK_VAR(8, 16) +HIGHBD_MASK_SUBPIX_VAR(8, 16) + +HIGHBD_MASK_VAR(16, 8) +HIGHBD_MASK_SUBPIX_VAR(16, 8) + +HIGHBD_MASK_VAR(16, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 16) + +HIGHBD_MASK_VAR(16, 32) +HIGHBD_MASK_SUBPIX_VAR(16, 32) + +HIGHBD_MASK_VAR(32, 16) +HIGHBD_MASK_SUBPIX_VAR(32, 16) + +HIGHBD_MASK_VAR(32, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 32) + +HIGHBD_MASK_VAR(32, 64) +HIGHBD_MASK_SUBPIX_VAR(32, 64) + +HIGHBD_MASK_VAR(64, 32) +HIGHBD_MASK_SUBPIX_VAR(64, 32) + +HIGHBD_MASK_VAR(64, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_VAR(64, 128) +HIGHBD_MASK_SUBPIX_VAR(64, 128) + +HIGHBD_MASK_VAR(128, 64) +HIGHBD_MASK_SUBPIX_VAR(128, 64) + +HIGHBD_MASK_VAR(128, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_EXT_INTER + +#if CONFIG_AV1 && CONFIG_MOTION_VAR +static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +#define OBMC_VAR(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ + } + +OBMC_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 4) + +OBMC_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 8) + +OBMC_VAR(8, 4) +OBMC_SUBPIX_VAR(8, 4) + +OBMC_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 8) + +OBMC_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 16) + +OBMC_VAR(16, 8) +OBMC_SUBPIX_VAR(16, 8) + +OBMC_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 16) + +OBMC_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 32) + +OBMC_VAR(32, 16) +OBMC_SUBPIX_VAR(32, 16) + +OBMC_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 32) + +OBMC_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 64) + +OBMC_VAR(64, 32) +OBMC_SUBPIX_VAR(64, 32) + +OBMC_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +OBMC_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 128) + +OBMC_VAR(128, 64) +OBMC_SUBPIX_VAR(128, 64) + +OBMC_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_HIGHBITDEPTH +static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + uint64_t *sse, int64_t *sum) { + int i, j; + uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12); + *sum += diff; + *sse += diff * diff; + } + + pre += pre_stride; + wsrc += w; + mask += w; + } +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64; + uint64_t sse64; + highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64); + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HIGHBD_OBMC_VAR(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } \ + \ + unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \ + W, wsrc, mask, sse); \ + } + +HIGHBD_OBMC_VAR(4, 4) +HIGHBD_OBMC_SUBPIX_VAR(4, 4) + +HIGHBD_OBMC_VAR(4, 8) +HIGHBD_OBMC_SUBPIX_VAR(4, 8) + +HIGHBD_OBMC_VAR(8, 4) +HIGHBD_OBMC_SUBPIX_VAR(8, 4) + +HIGHBD_OBMC_VAR(8, 8) +HIGHBD_OBMC_SUBPIX_VAR(8, 8) + +HIGHBD_OBMC_VAR(8, 16) +HIGHBD_OBMC_SUBPIX_VAR(8, 16) + +HIGHBD_OBMC_VAR(16, 8) +HIGHBD_OBMC_SUBPIX_VAR(16, 8) + +HIGHBD_OBMC_VAR(16, 16) +HIGHBD_OBMC_SUBPIX_VAR(16, 16) + +HIGHBD_OBMC_VAR(16, 32) +HIGHBD_OBMC_SUBPIX_VAR(16, 32) + +HIGHBD_OBMC_VAR(32, 16) +HIGHBD_OBMC_SUBPIX_VAR(32, 16) + +HIGHBD_OBMC_VAR(32, 32) +HIGHBD_OBMC_SUBPIX_VAR(32, 32) + +HIGHBD_OBMC_VAR(32, 64) +HIGHBD_OBMC_SUBPIX_VAR(32, 64) + +HIGHBD_OBMC_VAR(64, 32) +HIGHBD_OBMC_SUBPIX_VAR(64, 32) + +HIGHBD_OBMC_VAR(64, 64) +HIGHBD_OBMC_SUBPIX_VAR(64, 64) + +#if CONFIG_EXT_PARTITION +HIGHBD_OBMC_VAR(64, 128) +HIGHBD_OBMC_SUBPIX_VAR(64, 128) + +HIGHBD_OBMC_VAR(128, 64) +HIGHBD_OBMC_SUBPIX_VAR(128, 64) + +HIGHBD_OBMC_VAR(128, 128) +HIGHBD_OBMC_SUBPIX_VAR(128, 128) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_HIGHBITDEPTH +#endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h new file mode 100644 index 000000000..7c925cfac --- /dev/null +++ b/third_party/aom/aom_dsp/variance.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_VARIANCE_H_ +#define AOM_DSP_VARIANCE_H_ + +#include "./aom_config.h" + +#include "aom/aom_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define FILTER_BITS 7 +#define FILTER_WEIGHT 128 + +typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride); + +typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred); + +typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, + int b_stride, int n); + +typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sad_array); + +typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *const b_array[], + int b_stride, unsigned int *sad_array); + +typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + const uint8_t *b, int b_stride, + unsigned int *sse); + +typedef unsigned int (*aom_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred); + +#if CONFIG_AV1 && CONFIG_EXT_INTER +typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *msk_ptr, + int msk_stride); +typedef unsigned int (*aom_masked_variance_fn_t)( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *msk, int msk_stride, unsigned int *sse); +typedef unsigned int (*aom_masked_subpixvariance_fn_t)( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse); +#endif // CONFIG_AV1 && CONFIG_EXT_INTER + +#if CONFIG_AV1 && CONFIG_MOTION_VAR +typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, + const int32_t *wsrc, + const int32_t *msk); +typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, + int pred_stride, + const int32_t *wsrc, + const int32_t *msk, + unsigned int *sse); +typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( + const uint8_t *pred, int pred_stride, int xoffset, int yoffset, + const int32_t *wsrc, const int32_t *msk, unsigned int *sse); +#endif // CONFIG_AV1 && CONFIG_MOTION_VAR + +#if CONFIG_AV1 +typedef struct aom_variance_vtable { + aom_sad_fn_t sdf; + aom_sad_avg_fn_t sdaf; + aom_variance_fn_t vf; + aom_subpixvariance_fn_t svf; + aom_subp_avg_variance_fn_t svaf; + aom_sad_multi_fn_t sdx3f; + aom_sad_multi_fn_t sdx8f; + aom_sad_multi_d_fn_t sdx4df; +#if CONFIG_EXT_INTER + aom_masked_sad_fn_t msdf; + aom_masked_variance_fn_t mvf; + aom_masked_subpixvariance_fn_t msvf; +#endif // CONFIG_EXT_INTER +#if CONFIG_MOTION_VAR + aom_obmc_sad_fn_t osdf; + aom_obmc_variance_fn_t ovf; + aom_obmc_subpixvariance_fn_t osvf; +#endif // CONFIG_MOTION_VAR +} aom_variance_fn_ptr_t; +#endif // CONFIG_AV1 + +void aom_highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, uint16_t *output_ptr, + unsigned int src_pixels_per_line, int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter); + +void aom_highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int pixel_step, + unsigned int output_height, unsigned int output_width, + const uint8_t *filter); + +uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h); + +#if CONFIG_HIGHBITDEPTH +uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, int w, int h); +#endif // CONFIG_HIGHBITDEPTH + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_VARIANCE_H_ diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c new file mode 100644 index 000000000..4067b0b53 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" + +#if HAVE_SSE2 +filter8_1dfunction aom_filter_block1d16_v8_sse2; +filter8_1dfunction aom_filter_block1d16_h8_sse2; +filter8_1dfunction aom_filter_block1d8_v8_sse2; +filter8_1dfunction aom_filter_block1d8_h8_sse2; +filter8_1dfunction aom_filter_block1d4_v8_sse2; +filter8_1dfunction aom_filter_block1d4_h8_sse2; +filter8_1dfunction aom_filter_block1d16_v8_avg_sse2; +filter8_1dfunction aom_filter_block1d16_h8_avg_sse2; +filter8_1dfunction aom_filter_block1d8_v8_avg_sse2; +filter8_1dfunction aom_filter_block1d8_h8_avg_sse2; +filter8_1dfunction aom_filter_block1d4_v8_avg_sse2; +filter8_1dfunction aom_filter_block1d4_h8_avg_sse2; + +filter8_1dfunction aom_filter_block1d16_v2_sse2; +filter8_1dfunction aom_filter_block1d16_h2_sse2; +filter8_1dfunction aom_filter_block1d8_v2_sse2; +filter8_1dfunction aom_filter_block1d8_h2_sse2; +filter8_1dfunction aom_filter_block1d4_v2_sse2; +filter8_1dfunction aom_filter_block1d4_h2_sse2; +filter8_1dfunction aom_filter_block1d16_v2_avg_sse2; +filter8_1dfunction aom_filter_block1d16_h2_avg_sse2; +filter8_1dfunction aom_filter_block1d8_v2_avg_sse2; +filter8_1dfunction aom_filter_block1d8_h2_avg_sse2; +filter8_1dfunction aom_filter_block1d4_v2_avg_sse2; +filter8_1dfunction aom_filter_block1d4_h2_avg_sse2; + +// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); + +// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2); +FUN_CONV_2D(avg_, sse2); + +#if CONFIG_HIGHBITDEPTH && ARCH_X86_64 +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2; + +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2; + +// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + sse2); + +// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2); +HIGH_FUN_CONV_2D(avg_, sse2); + +#if CONFIG_LOOP_RESTORATION +// The SSE2 highbd convolve functions can deal with coefficients up to 32767. +// So redirect highbd_convolve8_add_src to regular highbd_convolve8. +void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + assert(x_step_q4 == 16); + assert(y_step_q4 == 16); + ((int16_t *)filter_x)[3] += 128; + ((int16_t *)filter_y)[3] += 128; + aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + ((int16_t *)filter_x)[3] -= 128; + ((int16_t *)filter_y)[3] -= 128; +} +#endif // CONFIG_LOOP_RESTORATION +#endif // CONFIG_HIGHBITDEPTH && ARCH_X86_64 +#endif // HAVE_SSE2 diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm new file mode 100644 index 000000000..4d3142867 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm @@ -0,0 +1,345 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro convolve_fn 1-2 +%ifidn %1, avg +%define AUX_XMM_REGS 4 +%else +%define AUX_XMM_REGS 0 +%endif +%ifidn %2, highbd +%define pavg pavgw +cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h, bd +%else +%define pavg pavgb +cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ + dst, dst_stride, \ + fx, fxs, fy, fys, w, h +%endif + mov r4d, dword wm +%ifidn %2, highbd + shl r4d, 1 + shl srcq, 1 + shl src_strideq, 1 + shl dstq, 1 + shl dst_strideq, 1 +%else + cmp r4d, 4 + je .w4 +%endif + cmp r4d, 8 + je .w8 + cmp r4d, 16 + je .w16 + cmp r4d, 32 + je .w32 + +%if CONFIG_AV1 && CONFIG_EXT_PARTITION + cmp r4d, 64 + je .w64 +%ifidn %2, highbd + cmp r4d, 128 + je .w128 + +.w256: + mov r4d, dword hm +.loop256: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + movu m0, [srcq+128] + movu m1, [srcq+128+16] + movu m2, [srcq+128+32] + movu m3, [srcq+128+48] +%ifidn %1, avg + pavg m0, [dstq+128] + pavg m1, [dstq+128+16] + pavg m2, [dstq+128+32] + pavg m3, [dstq+128+48] +%endif + mova [dstq+128 ], m0 + mova [dstq+128+16], m1 + mova [dstq+128+32], m2 + mova [dstq+128+48], m3 + movu m0, [srcq+128+64] + movu m1, [srcq+128+80] + movu m2, [srcq+128+96] + movu m3, [srcq+128+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+128+64] + pavg m1, [dstq+128+80] + pavg m2, [dstq+128+96] + pavg m3, [dstq+128+112] +%endif + mova [dstq+128+64], m0 + mova [dstq+128+80], m1 + mova [dstq+128+96], m2 + mova [dstq+128+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop256 + RET +%endif + +.w128: + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop128 + RET + +%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION + +%ifidn %2, highbd + cmp r4d, 64 + je .w64 + + mov r4d, dword hm +.loop128: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + movu m0, [srcq+64] + movu m1, [srcq+80] + movu m2, [srcq+96] + movu m3, [srcq+112] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq+64] + pavg m1, [dstq+80] + pavg m2, [dstq+96] + pavg m3, [dstq+112] +%endif + mova [dstq+64], m0 + mova [dstq+80], m1 + mova [dstq+96], m2 + mova [dstq+112], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop128 + RET +%endif +%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION + +.w64: + mov r4d, dword hm +.loop64: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+32] + movu m3, [srcq+48] + add srcq, src_strideq +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+16] + pavg m2, [dstq+32] + pavg m3, [dstq+48] +%endif + mova [dstq ], m0 + mova [dstq+16], m1 + mova [dstq+32], m2 + mova [dstq+48], m3 + add dstq, dst_strideq + sub r4d, 1 + jnz .loop64 + RET + +.w32: + mov r4d, dword hm +.loop32: + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+src_strideq] + movu m3, [srcq+src_strideq+16] + lea srcq, [srcq+src_strideq*2] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq +16] + pavg m2, [dstq+dst_strideq] + pavg m3, [dstq+dst_strideq+16] +%endif + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+dst_strideq ], m2 + mova [dstq+dst_strideq+16], m3 + lea dstq, [dstq+dst_strideq*2] + sub r4d, 2 + jnz .loop32 + RET + +.w16: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop16: + movu m0, [srcq] + movu m1, [srcq+src_strideq] + movu m2, [srcq+src_strideq*2] + movu m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + pavg m0, [dstq] + pavg m1, [dstq+dst_strideq] + pavg m2, [dstq+dst_strideq*2] + pavg m3, [dstq+r6q] +%endif + mova [dstq ], m0 + mova [dstq+dst_strideq ], m1 + mova [dstq+dst_strideq*2], m2 + mova [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop16 + RET + +.w8: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop8: + movh m0, [srcq] + movh m1, [srcq+src_strideq] + movh m2, [srcq+src_strideq*2] + movh m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movh m4, [dstq] + movh m5, [dstq+dst_strideq] + movh m6, [dstq+dst_strideq*2] + movh m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movh [dstq ], m0 + movh [dstq+dst_strideq ], m1 + movh [dstq+dst_strideq*2], m2 + movh [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop8 + RET + +%ifnidn %2, highbd +.w4: + mov r4d, dword hm + lea r5q, [src_strideq*3] + lea r6q, [dst_strideq*3] +.loop4: + movd m0, [srcq] + movd m1, [srcq+src_strideq] + movd m2, [srcq+src_strideq*2] + movd m3, [srcq+r5q] + lea srcq, [srcq+src_strideq*4] +%ifidn %1, avg + movd m4, [dstq] + movd m5, [dstq+dst_strideq] + movd m6, [dstq+dst_strideq*2] + movd m7, [dstq+r6q] + pavg m0, m4 + pavg m1, m5 + pavg m2, m6 + pavg m3, m7 +%endif + movd [dstq ], m0 + movd [dstq+dst_strideq ], m1 + movd [dstq+dst_strideq*2], m2 + movd [dstq+r6q ], m3 + lea dstq, [dstq+dst_strideq*4] + sub r4d, 4 + jnz .loop4 + RET +%endif +%endmacro + +INIT_XMM sse2 +convolve_fn copy +convolve_fn avg +%if CONFIG_HIGHBITDEPTH +convolve_fn copy, highbd +convolve_fn avg, highbd +%endif diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm new file mode 100644 index 000000000..e6d357ba3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -0,0 +1,965 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro HIGH_GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm6 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + punpcklwd xmm1, xmm7 + + movdqa k0k6, xmm0 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + movdqa k1k7, xmm1 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) + +%endm + +%macro HIGH_APPLY_FILTER_4 1 + punpcklwd xmm0, xmm6 ;two row in one register + punpcklwd xmm1, xmm7 + punpcklwd xmm2, xmm5 + punpcklwd xmm3, xmm4 + + pmaddwd xmm0, k0k6 ;multiply the filter factors + pmaddwd xmm1, k1k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm3, k3k4 + + paddd xmm0, xmm1 ;sum + paddd xmm0, xmm2 + paddd xmm0, xmm3 + + paddd xmm0, krd ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movq [rdi], xmm0 +%endm + +%macro HIGH_GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + punpcklwd xmm0, xmm1 + punpckhwd xmm6, xmm7 + punpckhwd xmm2, xmm5 + punpckhwd xmm3, xmm4 + + movdqa k0k1, xmm0 ;store filter factors on stack + movdqa k6k7, xmm6 + movdqa k2k5, xmm2 + movdqa k3k4, xmm3 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + ;Compute max and min values of a pixel + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm0, rdx + movq xmm1, rcx + pshufd xmm0, xmm0, 0b + movdqa xmm2, xmm0 + psllw xmm0, xmm1 + psubw xmm0, xmm2 + pxor xmm1, xmm1 + movdqa max, xmm0 ;max value (for clamping) + movdqa min, xmm1 ;min value (for clamping) +%endm + +%macro LOAD_VERT_8 1 + movdqu xmm0, [rsi + %1] ;0 + movdqu xmm1, [rsi + rax + %1] ;1 + movdqu xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movdqu xmm7, [rsi + rdx * 2 + %1] ;7 + movdqu xmm2, [rsi + rax + %1] ;2 + movdqu xmm3, [rsi + rax * 2 + %1] ;3 + movdqu xmm4, [rsi + rdx + %1] ;4 + movdqu xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro HIGH_APPLY_FILTER_8 2 + movdqu temp, xmm4 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm1 + punpckhwd xmm4, xmm1 + movdqa xmm1, xmm6 + punpcklwd xmm6, xmm7 + punpckhwd xmm1, xmm7 + movdqa xmm7, xmm2 + punpcklwd xmm2, xmm5 + punpckhwd xmm7, xmm5 + + movdqu xmm5, temp + movdqu temp, xmm4 + movdqa xmm4, xmm3 + punpcklwd xmm3, xmm5 + punpckhwd xmm4, xmm5 + movdqu xmm5, temp + + pmaddwd xmm0, k0k1 + pmaddwd xmm5, k0k1 + pmaddwd xmm6, k6k7 + pmaddwd xmm1, k6k7 + pmaddwd xmm2, k2k5 + pmaddwd xmm7, k2k5 + pmaddwd xmm3, k3k4 + pmaddwd xmm4, k3k4 + + paddd xmm0, xmm6 + paddd xmm0, xmm2 + paddd xmm0, xmm3 + paddd xmm5, xmm1 + paddd xmm5, xmm7 + paddd xmm5, xmm4 + + paddd xmm0, krd ;rounding + paddd xmm5, krd + psrad xmm0, 7 ;shift + psrad xmm5, 7 + packssdw xmm0, xmm5 ;pack back to word + + ;clamp the values + pminsw xmm0, max + pmaxsw xmm0, min + +%if %1 + movdqu xmm1, [rdi + %2] + pavgw xmm0, xmm1 +%endif + movdqu [rdi + %2], xmm0 +%endm + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 0, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movq xmm0, [rsi] ;load src: row 0 + movq xmm1, [rsi + rax] ;1 + movq xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2] ;7 + movq xmm2, [rsi + rax] ;2 + movq xmm3, [rsi + rax * 2] ;3 + movq xmm4, [rsi + rdx] ;4 + movq xmm5, [rsi + rax * 4] ;5 + + HIGH_APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rbx, [rbx + rbx] + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + HIGH_APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 16 + HIGH_APPLY_FILTER_8 1, 16 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 0, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 7 + %define k0k6 [rsp + 16 * 0] + %define k2k5 [rsp + 16 * 1] + %define k3k4 [rsp + 16 * 2] + %define k1k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define max [rsp + 16 * 5] + %define min [rsp + 16 * 6] + + HIGH_GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm4, [rsi + 2] + movdqa xmm1, xmm0 + movdqa xmm6, xmm4 + movdqa xmm7, xmm4 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm4 + + psrldq xmm1, 2 + psrldq xmm6, 4 + psrldq xmm7, 6 + psrldq xmm2, 4 + psrldq xmm3, 6 + psrldq xmm5, 2 + + HIGH_APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 7 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 8 + %define k0k1 [rsp + 16 * 0] + %define k6k7 [rsp + 16 * 1] + %define k2k5 [rsp + 16 * 2] + %define k3k4 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define temp [rsp + 16 * 5] + %define max [rsp + 16 * 6] + %define min [rsp + 16 * 7] + + HIGH_GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + lea rax, [rax + rax] ;bytes per line + lea rdx, [rdx + rdx] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 6] ;load src + movdqu xmm1, [rsi - 4] + movdqu xmm2, [rsi - 2] + movdqu xmm3, [rsi] + movdqu xmm4, [rsi + 2] + movdqu xmm5, [rsi + 4] + movdqu xmm6, [rsi + 6] + movdqu xmm7, [rsi + 8] + + HIGH_APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 10] ;load src + movdqu xmm1, [rsi + 12] + movdqu xmm2, [rsi + 14] + movdqu xmm3, [rsi + 16] + movdqu xmm4, [rsi + 18] + movdqu xmm5, [rsi + 20] + movdqu xmm6, [rsi + 22] + movdqu xmm7, [rsi + 24] + + HIGH_APPLY_FILTER_8 1, 16 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 8 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..9e2ec748c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -0,0 +1,497 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro HIGH_GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklwd xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm5, rdx + movq xmm2, rcx + pshufd xmm5, xmm5, 0b + movdqa xmm1, xmm5 + psllw xmm5, xmm2 + psubw xmm5, xmm1 ;max value (for clamping) + pxor xmm2, xmm2 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_4 1 + + punpcklwd xmm0, xmm1 ;two row in one register + pmaddwd xmm0, xmm4 ;multiply the filter factors + + paddd xmm0, xmm3 ;rounding + psrad xmm0, 7 ;shift + packssdw xmm0, xmm0 ;pack to word + + ;clamp the values + pminsw xmm0, xmm5 + pmaxsw xmm0, xmm2 + +%if %1 + movq xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + + movq [rdi], xmm0 + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%if ARCH_X86_64 +%macro HIGH_GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x00000040 + + movdqa xmm6, [rdx] ;load filters + + pshuflw xmm7, xmm6, 11111111b ;k3 + pshufhw xmm6, xmm6, 0b ;k4 + psrldq xmm6, 8 + punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + mov rdx, 0x00010001 + movsxd rcx, DWORD PTR arg(6) ;bps + movq xmm8, rdx + movq xmm5, rcx + pshufd xmm8, xmm8, 0b + movdqa xmm1, xmm8 + psllw xmm8, xmm5 + psubw xmm8, xmm1 ;max value (for clamping) + pxor xmm5, xmm5 ;min value (for clamping) + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro HIGH_APPLY_FILTER_8 1 + movdqa xmm6, xmm0 + punpckhwd xmm6, xmm1 + punpcklwd xmm0, xmm1 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + + paddd xmm6, xmm4 ;rounding + paddd xmm0, xmm4 ;rounding + psrad xmm6, 7 ;shift + psrad xmm0, 7 ;shift + packssdw xmm0, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + +%if %1 + movdqu xmm1, [rdi] + pavgw xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm + +%macro HIGH_APPLY_FILTER_16 1 + movdqa xmm9, xmm0 + movdqa xmm6, xmm2 + punpckhwd xmm9, xmm1 + punpckhwd xmm6, xmm3 + punpcklwd xmm0, xmm1 + punpcklwd xmm2, xmm3 + + pmaddwd xmm9, xmm7 + pmaddwd xmm6, xmm7 + pmaddwd xmm0, xmm7 + pmaddwd xmm2, xmm7 + + paddd xmm9, xmm4 ;rounding + paddd xmm6, xmm4 + paddd xmm0, xmm4 + paddd xmm2, xmm4 + + psrad xmm9, 7 ;shift + psrad xmm6, 7 + psrad xmm0, 7 + psrad xmm2, 7 + + packssdw xmm0, xmm9 ;pack back to word + packssdw xmm2, xmm6 ;pack back to word + + ;clamp the values + pminsw xmm0, xmm8 + pmaxsw xmm0, xmm5 + pminsw xmm2, xmm8 + pmaxsw xmm2, xmm5 + +%if %1 + movdqu xmm1, [rdi] + movdqu xmm3, [rdi + 16] + pavgw xmm0, xmm1 + pavgw xmm2, xmm3 +%endif + movdqu [rdi], xmm0 ;store the result + movdqu [rdi + 16], xmm2 ;store the result + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + dec rcx +%endm +%endif + +global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm2, [rsi + 16] + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movq xmm0, [rsi] ;load src + movq xmm1, [rsi + 2*rax] + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + 2*rax] ;1 + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 2*rax + 16] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif + +global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 2 + + HIGH_APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +%if ARCH_X86_64 +global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 8 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + + HIGH_APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE +sym(aom_highbd_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 9 + push rsi + push rdi + ; end prolog + + HIGH_GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 2] + movdqu xmm2, [rsi + 16] + movdqu xmm3, [rsi + 18] + + HIGH_APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c new file mode 100644 index 000000000..61476b8be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_ports/mem.h" + +// filters for 16_h8 and 16_v8 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static void aom_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); + srcReg32b1 = _mm256_inserti128_si256( + srcReg32b1, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), + 1); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); + srcReg32b2 = _mm256_inserti128_si256( + srcReg32b2, + _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), + 1); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16( + srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + // filter the source buffer + srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + srcRegFilt32b2_1 = _mm256_adds_epi16( + srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt32b1_1)); + + // save the next 16 bits + _mm_store_si128((__m128i *)(output_ptr + output_pitch), + _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + + // reading the next 16 bytes + // (part of it was being read by earlier read) + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + // filter the source buffer + srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2_1 = + _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); + + srcRegFilt2_1 = + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + } +} + +static void aom_filter_block1d16_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); + srcReg32b2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); + srcReg32b3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); + srcReg32b4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); + srcReg32b5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + srcReg32b6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, + _mm256_castsi256_si128(srcReg32b2), 1); + srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, + _mm256_castsi256_si128(srcReg32b3), 1); + srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, + _mm256_castsi256_si128(srcReg32b4), 1); + srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, + _mm256_castsi256_si128(srcReg32b5), 1); + srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, + _mm256_castsi256_si128(srcReg32b6), 1); + srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, + _mm256_castsi256_si128(srcReg32b7), 1); + + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); + + // save + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); + + // save + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + // save + srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); + + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); + + // add and saturate the results together + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_min_epi16(srcReg32b8, srcReg32b12)); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, + _mm256_max_epi16(srcReg32b8, srcReg32b12)); + + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + + src_ptr += src_stride; + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + + // save the next 16 bits + _mm_store_si128((__m128i *)(output_ptr + out_pitch), + _mm256_extractf128_si256(srcReg32b1, 1)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b1 = srcReg32b3; + srcReg32b11 = srcReg32b2; + srcReg32b3 = srcReg32b5; + srcReg32b2 = srcReg32b4; + srcReg32b5 = srcReg32b7; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; + __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + srcRegFilt7 = + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt7 = + _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); + + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); + srcRegFilt3 = + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + } +} + +#if HAVE_AVX2 && HAVE_SSSE3 +filter8_1dfunction aom_filter_block1d4_v8_ssse3; +#if ARCH_X86_64 +filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; +#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3 +#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3 +#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3 +#else // ARCH_X86 +filter8_1dfunction aom_filter_block1d8_v8_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_ssse3; +#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3 +#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3 +#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3 +#endif // ARCH_X86_64 +filter8_1dfunction aom_filter_block1d16_v2_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; +#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3 +#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3 +#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3 +#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3 +#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3 +#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3 +#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3 +// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); + +// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, avx2); +#endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c new file mode 100644 index 000000000..be37738df --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -0,0 +1,920 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +// filters only for the 4_h8 convolution +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +// filters for 8_h8 and 16_h8 +DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +// These are reused by the avx2 intrinsics. +filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; + +void aom_filter_block1d4_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i firstFilters, secondFilters, shuffle1, shuffle2; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, srcReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter into the first lane + firstFilters = _mm_shufflelo_epi16(filtersReg, 0); + // duplicate only the third 16 bit in the filter into the first lane + secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); + // duplicate only the seconds 16 bits in the filter into the second lane + // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3 + firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); + // duplicate only the forth 16 bits in the filter into the second lane + // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7 + secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); + + // loading the local filters + shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); + shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // extract the higher half of the lane + srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); + srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + + minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + + // add and saturate all the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + src_ptr += src_pixels_per_line; + + // save only 4 bytes + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_h8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; + __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; + __m128i addFilterReg64, filtersReg, minReg; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 128 bit register + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 128 bit register + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + filt1Reg = _mm_load_si128((__m128i const *)filt1_global); + filt2Reg = _mm_load_si128((__m128i const *)filt2_global); + filt3Reg = _mm_load_si128((__m128i const *)filt3_global); + filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + + for (i = 0; i < output_height; i++) { + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); + srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); + srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); + srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); + + // add and saturate all the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bits + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr += src_pixels_per_line; + + // save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_v8_intrin_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i addFilterReg64, filtersReg, minReg; + __m128i firstFilters, secondFilters, thirdFilters, forthFilters; + __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; + __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; + __m128i srcReg8; + unsigned int i; + + // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the first 16 bits in the filter + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); + // duplicate only the second 16 bits in the filter + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits in the filter + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + // duplicate only the forth 16 bits in the filter + forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + + // load the first 7 rows of 8 bytes + srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + + for (i = 0; i < output_height; i++) { + // load the last 8 bytes + srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the result together + srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); + srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + + // merge the result together + srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); + srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); + srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); + srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); + + // add and saturate the results together + minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); + srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + + // shift by 7 bit each 16 bit + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + + // shrink to 8 bit each 16 bits + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + + src_ptr += src_pitch; + + // shift down a row + srcReg1 = srcReg2; + srcReg2 = srcReg3; + srcReg3 = srcReg4; + srcReg4 = srcReg5; + srcReg5 = srcReg6; + srcReg6 = srcReg7; + srcReg7 = srcReg8; + + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + + output_ptr += out_pitch; + } +} + +filter8_1dfunction aom_filter_block1d16_v8_ssse3; +filter8_1dfunction aom_filter_block1d16_h8_ssse3; +filter8_1dfunction aom_filter_block1d8_v8_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_ssse3; +filter8_1dfunction aom_filter_block1d4_v8_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_ssse3; +filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3; +filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3; +filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3; +#if CONFIG_LOOP_RESTORATION +filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3; +filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3; +#endif + +filter8_1dfunction aom_filter_block1d16_v2_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_ssse3; +filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3; + +// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); +FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + ssse3); + +#if CONFIG_LOOP_RESTORATION +FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_, + ssse3); +FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v, + src - src_stride * 3, add_src_, ssse3); +#endif + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ + const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ + const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ + const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ + \ + const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ + const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ + const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ + const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ + \ + out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ + out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ + out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ + out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ + out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ + out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ + out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ + out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ + } + +static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); + const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); + const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); + const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); + const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); + const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); + const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); + // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 + const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); + // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 + const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); + // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 + const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); + // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 + const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); + // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 + const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); + // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 + const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); + // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); + const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); + const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); + const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride) { + __m128i A, B, C, D, E, F, G, H; + + A = _mm_loadl_epi64((const __m128i *)src); + B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); + F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); + G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); + H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); + + TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); + + _mm_storel_epi64((__m128i *)dst, A); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); + _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 8) { + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + transpose8x8_to_dst(temp, 8, dst + x, dst_stride); + } + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + // TRANSPOSE... + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // + // TO + // + // 00 10 20 30 + // 01 11 21 31 + // 02 12 22 32 + // 03 13 23 33 + // 04 14 24 34 + // 05 15 25 35 + // 06 16 26 36 + // 07 17 27 37 + // + // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 + const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); + // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 + const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); + // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 + const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 + const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); + // 02 03 12 13 22 23 32 33 + const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + // 06 07 16 17 26 27 36 37 + const __m128i s7s6 = _mm_srli_si128(s5s4, 8); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride) { + __m128i A = _mm_cvtsi32_si128(*(const int *)src); + __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); + __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); + __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); + // 00 10 01 11 02 12 03 13 + const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); + // 20 30 21 31 22 32 23 33 + const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + A = _mm_unpacklo_epi16(tr0_0, tr0_1); + B = _mm_srli_si128(A, 4); + C = _mm_srli_si128(A, 8); + D = _mm_srli_si128(A, 12); + + *(int *)(dst) = _mm_cvtsi128_si32(A); + *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); + *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); + *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 4) { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + transpose4x4_to_dst(temp, 4, dst + x, dst_stride); + } + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); + const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); + const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); + const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); + const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); + const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); + const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); + const __m128i s1s0 = _mm_unpacklo_epi8(A, B); + const __m128i s3s2 = _mm_unpacklo_epi8(C, D); + const __m128i s5s4 = _mm_unpacklo_epi8(E, F); + const __m128i s7s6 = _mm_unpacklo_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 4 bytes + *(int *)dst = _mm_cvtsi128_si32(temp); +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + + y_q4 += y_step_q4; + } +} + +static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); + const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i s1s0 = _mm_unpacklo_epi8(A, B); + const __m128i s3s2 = _mm_unpacklo_epi8(C, D); + const __m128i s5s4 = _mm_unpacklo_epi8(E, F); + const __m128i s7s6 = _mm_unpacklo_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); + const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); + const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); + const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); + // add and saturate the results together + const __m128i min_x2x1 = _mm_min_epi16(x2, x1); + const __m128i max_x2x1 = _mm_max_epi16(x2, x1); + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, min_x2x1); + temp = _mm_adds_epi16(temp, max_x2x1); + // round and shift by 7 bit each 16 bit + temp = _mm_mulhrs_epi16(temp, k_256); + // shrink to 8 bit each 16 bits + temp = _mm_packus_epi16(temp, temp); + // save only 8 bytes convolve result + _mm_storel_epi64((__m128i *)dst, temp); +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *filter, int w) { + const __m128i k_256 = _mm_set1_epi16(1 << 8); + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); + int i; + + for (i = 0; i < w; i += 16) { + const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); + const __m128i C = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + const __m128i D = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + const __m128i E = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + const __m128i F = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + const __m128i G = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + const __m128i H = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + // merge the result together + const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); + const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); + const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); + const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); + const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); + const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); + const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); + // add and saturate the results together + const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); + const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); + // merge the result together + const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); + const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); + const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); + // merge the result together + const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); + const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); + // multiply 2 adjacent elements with the filter and add the result + const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); + const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); + // add and saturate the results together + __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); + __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); + + // add and saturate the results together + temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); + temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); + // round and shift by 7 bit each 16 bit + temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); + temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + temp_hi = _mm_packus_epi16(temp_lo, temp_hi); + src_ptr += 16; + // save 16 bytes convolve result + _mm_store_si128((__m128i *)&dst[i], temp_hi); + } +} + +static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (y = 0; y < h; ++y) { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + if (y_q4 & SUBPEL_MASK) { + filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); + } + y_q4 += y_step_q4; + } +} + +static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, + int y0_q4, int y_step_q4, int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); + } +} + +static const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. + // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static int get_filter_offset(const int16_t *f, const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + +void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, + int x_step_q4, const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); +} + +// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const int16_t *filter_x, int x_step_q4, +// const int16_t *filter_y, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, ssse3); +FUN_CONV_2D(avg_, ssse3); +#if CONFIG_LOOP_RESTORATION +FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3); +#endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm new file mode 100644 index 000000000..b946010d3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm @@ -0,0 +1,990 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;Note: tap3 and tap4 have to be applied and added after other taps to avoid +;overflow. + +%macro GET_FILTERS_4 0 + mov rdx, arg(5) ;filter ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + psrldq xmm7, 8 + pshuflw xmm4, xmm7, 0b ;k4 + pshuflw xmm5, xmm7, 01010101b ;k5 + pshuflw xmm6, xmm7, 10101010b ;k6 + pshuflw xmm7, xmm7, 11111111b ;k7 + + punpcklqdq xmm0, xmm1 + punpcklqdq xmm2, xmm3 + punpcklqdq xmm5, xmm4 + punpcklqdq xmm6, xmm7 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm2 + movdqa k5k4, xmm5 + movdqa k6k7, xmm6 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro APPLY_FILTER_4 1 + punpckldq xmm0, xmm1 ;two row in one register + punpckldq xmm6, xmm7 + punpckldq xmm2, xmm3 + punpckldq xmm5, xmm4 + + punpcklbw xmm0, zero ;unpack to word + punpcklbw xmm6, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + + pmullw xmm0, k0k1 ;multiply the filter factors + pmullw xmm6, k6k7 + pmullw xmm2, k2k3 + pmullw xmm5, k5k4 + + paddsw xmm0, xmm6 ;sum + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + psrldq xmm2, 8 + paddsw xmm0, xmm5 + psrldq xmm5, 8 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 +%endm + +%macro GET_FILTERS 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + pshuflw xmm0, xmm7, 0b ;k0 + pshuflw xmm1, xmm7, 01010101b ;k1 + pshuflw xmm2, xmm7, 10101010b ;k2 + pshuflw xmm3, xmm7, 11111111b ;k3 + pshufhw xmm4, xmm7, 0b ;k4 + pshufhw xmm5, xmm7, 01010101b ;k5 + pshufhw xmm6, xmm7, 10101010b ;k6 + pshufhw xmm7, xmm7, 11111111b ;k7 + + punpcklwd xmm0, xmm0 + punpcklwd xmm1, xmm1 + punpcklwd xmm2, xmm2 + punpcklwd xmm3, xmm3 + punpckhwd xmm4, xmm4 + punpckhwd xmm5, xmm5 + punpckhwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movdqa k0, xmm0 ;store filter factors on stack + movdqa k1, xmm1 + movdqa k2, xmm2 + movdqa k3, xmm3 + movdqa k4, xmm4 + movdqa k5, xmm5 + movdqa k6, xmm6 + movdqa k7, xmm7 + + movq xmm6, rcx + pshufd xmm6, xmm6, 0 + movdqa krd, xmm6 ;rounding + + pxor xmm7, xmm7 + movdqa zero, xmm7 +%endm + +%macro LOAD_VERT_8 1 + movq xmm0, [rsi + %1] ;0 + movq xmm1, [rsi + rax + %1] ;1 + movq xmm6, [rsi + rdx * 2 + %1] ;6 + lea rsi, [rsi + rax] + movq xmm7, [rsi + rdx * 2 + %1] ;7 + movq xmm2, [rsi + rax + %1] ;2 + movq xmm3, [rsi + rax * 2 + %1] ;3 + movq xmm4, [rsi + rdx + %1] ;4 + movq xmm5, [rsi + rax * 4 + %1] ;5 +%endm + +%macro APPLY_FILTER_8 2 + punpcklbw xmm0, zero + punpcklbw xmm1, zero + punpcklbw xmm6, zero + punpcklbw xmm7, zero + punpcklbw xmm2, zero + punpcklbw xmm5, zero + punpcklbw xmm3, zero + punpcklbw xmm4, zero + + pmullw xmm0, k0 + pmullw xmm1, k1 + pmullw xmm6, k6 + pmullw xmm7, k7 + pmullw xmm2, k2 + pmullw xmm5, k5 + pmullw xmm3, k3 + pmullw xmm4, k4 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm6 + paddsw xmm0, xmm7 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm3 + paddsw xmm0, xmm4 + + paddsw xmm0, krd ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi + %2] + pavgb xmm0, xmm1 +%endif + movq [rdi + %2], xmm0 +%endm + +;void aom_filter_block1d4_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d4_v8_sse2) PRIVATE +sym(aom_filter_block1d4_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d8_v8_sse2) PRIVATE +sym(aom_filter_block1d8_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_v8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d16_v8_sse2) PRIVATE +sym(aom_filter_block1d16_v8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 0, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 0, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE +sym(aom_filter_block1d4_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movd xmm0, [rsi] ;load src: row 0 + movd xmm1, [rsi + rax] ;1 + movd xmm6, [rsi + rdx * 2] ;6 + lea rsi, [rsi + rax] + movd xmm7, [rsi + rdx * 2] ;7 + movd xmm2, [rsi + rax] ;2 + movd xmm3, [rsi + rax * 2] ;3 + movd xmm4, [rsi + rdx] ;4 + movd xmm5, [rsi + rax * 4] ;5 + + APPLY_FILTER_4 1 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE +sym(aom_filter_block1d8_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + + lea rdi, [rdi + rbx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE +sym(aom_filter_block1d16_v8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rbx, DWORD PTR arg(3) ;out_pitch + lea rdx, [rax + rax * 2] + movsxd rcx, DWORD PTR arg(4) ;output_height +.loop: + LOAD_VERT_8 0 + APPLY_FILTER_8 1, 0 + sub rsi, rax + + LOAD_VERT_8 8 + APPLY_FILTER_8 1, 8 + add rdi, rbx + + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d4_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d4_h8_sse2) PRIVATE +sym(aom_filter_block1d4_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d8_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d8_h8_sse2) PRIVATE +sym(aom_filter_block1d8_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void aom_filter_block1d16_h8_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(aom_filter_block1d16_h8_sse2) PRIVATE +sym(aom_filter_block1d16_h8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 0, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE +sym(aom_filter_block1d4_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 6 + %define k0k1 [rsp + 16 * 0] + %define k2k3 [rsp + 16 * 1] + %define k5k4 [rsp + 16 * 2] + %define k6k7 [rsp + 16 * 3] + %define krd [rsp + 16 * 4] + %define zero [rsp + 16 * 5] + + GET_FILTERS_4 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm3, xmm0 + movdqa xmm5, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm3, 3 + psrldq xmm5, 5 + psrldq xmm4, 4 + + APPLY_FILTER_4 1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 6 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE +sym(aom_filter_block1d8_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE +sym(aom_filter_block1d16_h8_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 * 10 + %define k0 [rsp + 16 * 0] + %define k1 [rsp + 16 * 1] + %define k2 [rsp + 16 * 2] + %define k3 [rsp + 16 * 3] + %define k4 [rsp + 16 * 4] + %define k5 [rsp + 16 * 5] + %define k6 [rsp + 16 * 6] + %define k7 [rsp + 16 * 7] + %define krd [rsp + 16 * 8] + %define zero [rsp + 16 * 9] + + GET_FILTERS + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height + +.loop: + movdqu xmm0, [rsi - 3] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 0 + + movdqu xmm0, [rsi + 5] ;load src + + movdqa xmm1, xmm0 + movdqa xmm6, xmm0 + movdqa xmm7, xmm0 + movdqa xmm2, xmm0 + movdqa xmm5, xmm0 + movdqa xmm3, xmm0 + movdqa xmm4, xmm0 + + psrldq xmm1, 1 + psrldq xmm6, 6 + psrldq xmm7, 7 + psrldq xmm2, 2 + psrldq xmm5, 5 + psrldq xmm3, 3 + psrldq xmm4, 4 + + APPLY_FILTER_8 1, 8 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx + jnz .loop + + add rsp, 16 * 10 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm new file mode 100644 index 000000000..357f37401 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -0,0 +1,883 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_64: times 8 dw 64 +even_byte_mask: times 8 dw 0x00ff + +; %define USE_PMULHRSW +; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss +; when using this instruction. +; +; The add order below (based on ffav1) must be followed to prevent outranges. +; x = k0k1 + k4k5 +; y = k2k3 + k6k7 +; z = signed SAT(x + y) + +SECTION .text +%define LOCAL_VARS_SIZE 16*6 + +%macro SETUP_LOCAL_VARS 0 + ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + + ; pmaddubsw has a higher latency on some platforms, this might be eased by + ; interleaving the instructions. + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + packsswb m4, m4 + ; TODO(slavarnway): multiple pshufb instructions had a higher latency on + ; some platforms. + pshuflw m0, m4, 0b ;k0_k1 + pshuflw m1, m4, 01010101b ;k2_k3 + pshuflw m2, m4, 10101010b ;k4_k5 + pshuflw m3, m4, 11111111b ;k6_k7 + punpcklqdq m0, m0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + mova k0k1, m0 + mova k2k3, m1 + mova k4k5, m2 + mova k6k7, m3 +%if ARCH_X86_64 + %define krd m12 + %define tmp0 [rsp + 16*4] + %define tmp1 [rsp + 16*5] + mova krd, [GLOBAL(pw_64)] +%else + %define krd [rsp + 16*4] +%if CONFIG_PIC=0 + mova m6, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m6, m6 ;all ones + psrlw m6, 15 + psllw m6, 6 ;aka pw_64 +%endif + mova krd, m6 +%endif +%endm + +;------------------------------------------------------------------------------- +%if ARCH_X86_64 + %define LOCAL_VARS_SIZE_H4 0 +%else + %define LOCAL_VARS_SIZE_H4 16*4 +%endif + +%macro SUBPIX_HFILTER4 1 +cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + packsswb m4, m4 +%if ARCH_X86_64 + %define k0k1k4k5 m8 + %define k2k3k6k7 m9 + %define krd m10 + mova krd, [GLOBAL(pw_64)] + pshuflw k0k1k4k5, m4, 0b ;k0_k1 + pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 + pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 + pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 +%else + %define k0k1k4k5 [rsp + 16*0] + %define k2k3k6k7 [rsp + 16*1] + %define krd [rsp + 16*2] + pshuflw m6, m4, 0b ;k0_k1 + pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 + pshuflw m7, m4, 01010101b ;k2_k3 + pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 +%if CONFIG_PIC=0 + mova m1, [GLOBAL(pw_64)] +%else + ; build constants without accessing global memory + pcmpeqb m1, m1 ;all ones + psrlw m1, 15 + psllw m1, 6 ;aka pw_64 +%endif + mova k0k1k4k5, m6 + mova k2k3k6k7, m7 + mova krd, m1 +%endif + dec heightd + +.loop: + ;Do two rows at once + movu m4, [srcq - 3] + movu m5, [srcq + sstrideq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + punpckhbw m3, m5, m5 + punpcklbw m5, m5 + palignr m0, m1, m4, 1 + pmaddubsw m0, k0k1k4k5 + palignr m1, m4, 5 + pmaddubsw m1, k2k3k6k7 + palignr m2, m3, m5, 1 + pmaddubsw m2, k0k1k4k5 + palignr m3, m5, 5 + pmaddubsw m3, k2k3k6k7 + punpckhqdq m4, m0, m2 + punpcklqdq m0, m2 + punpckhqdq m5, m1, m3 + punpcklqdq m1, m3 + paddsw m0, m4 + paddsw m1, m5 +%ifidn %1, h8_avg + movd m4, [dstq] + movd m5, [dstq + dstrideq] +%endif + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2 + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 + psrldq m1, m0, 4 + +%ifidn %1, h8_avg + pavgb m0, m4 + pavgb m1, m5 +%endif + movd [dstq], m0 + movd [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m4, [srcq - 3] + punpckhbw m1, m4, m4 + punpcklbw m4, m4 + palignr m0, m1, m4, 1 + palignr m1, m4, 5 + pmaddubsw m0, k0k1k4k5 + pmaddubsw m1, k2k3k6k7 + psrldq m2, m0, 8 + psrldq m3, m1, 8 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + punpcklbw m4, m3 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, h8_avg + movd m4, [dstq] + pavgb m0, m4 +%endif + movd [dstq], m0 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER8 1 +cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + dec heightd + +.loop: + ;Do two rows at once + movu m0, [srcq - 3] + movu m4, [srcq + sstrideq - 3] + punpckhbw m1, m0, m0 + punpcklbw m0, m0 + palignr m5, m1, m0, 13 + pmaddubsw m5, k6k7 + palignr m2, m1, m0, 5 + palignr m3, m1, m0, 9 + palignr m1, m0, 1 + pmaddubsw m1, k0k1 + punpckhbw m6, m4, m4 + punpcklbw m4, m4 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + + palignr m7, m6, m4, 13 + palignr m0, m6, m4, 5 + pmaddubsw m7, k6k7 + paddsw m1, m3 + paddsw m2, m5 + paddsw m1, m2 +%ifidn %1, h8_avg + movh m2, [dstq] + movhps m2, [dstq + dstrideq] +%endif + palignr m5, m6, m4, 9 + palignr m6, m4, 1 + pmaddubsw m0, k2k3 + pmaddubsw m6, k0k1 + paddsw m1, krd + pmaddubsw m5, k4k5 + psraw m1, 7 + paddsw m0, m7 + paddsw m6, m5 + paddsw m6, m0 + paddsw m6, krd + psraw m6, 7 +%ifidn %1, h8_add_src + pxor m3, m3 + movu m4, [srcq] + movu m5, [srcq + sstrideq] + punpcklbw m4, m3 + punpcklbw m5, m3 + paddsw m1, m4 + paddsw m6, m5 +%endif + packuswb m1, m6 +%ifidn %1, h8_avg + pavgb m1, m2 +%endif + movh [dstq], m1 + movhps [dstq + dstrideq], m1 + + lea srcq, [srcq + sstrideq ] + prefetcht0 [srcq + 4 * sstrideq - 3] + lea srcq, [srcq + sstrideq ] + lea dstq, [dstq + 2 * dstrideq ] + prefetcht0 [srcq + 2 * sstrideq - 3] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m0, [srcq - 3] + punpckhbw m3, m0, m0 + punpcklbw m0, m0 + palignr m1, m3, m0, 1 + palignr m2, m3, m0, 5 + palignr m4, m3, m0, 13 + palignr m3, m0, 9 + pmaddubsw m1, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m3, k4k5 + pmaddubsw m4, k6k7 + paddsw m1, m3 + paddsw m4, m2 + paddsw m1, m4 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, h8_add_src + pxor m6, m6 + movu m5, [srcq] + punpcklbw m5, m6 + paddsw m1, m5 +%endif + packuswb m1, m1 +%ifidn %1, h8_avg + movh m0, [dstq] + pavgb m1, m0 +%endif + movh [dstq], m1 +.done: + REP_RET +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_HFILTER16 1 +cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +.loop: + prefetcht0 [srcq + 2 * sstrideq -3] + + movu m0, [srcq - 3] + movu m4, [srcq - 2] + pmaddubsw m0, k0k1 + pmaddubsw m4, k0k1 + movu m1, [srcq - 1] + movu m5, [srcq + 0] + pmaddubsw m1, k2k3 + pmaddubsw m5, k2k3 + movu m2, [srcq + 1] + movu m6, [srcq + 2] + pmaddubsw m2, k4k5 + pmaddubsw m6, k4k5 + movu m3, [srcq + 3] + movu m7, [srcq + 4] + pmaddubsw m3, k6k7 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m1, m3 + paddsw m0, m1 + paddsw m4, m6 + paddsw m5, m7 + paddsw m4, m5 + paddsw m0, krd + paddsw m4, krd + psraw m0, 7 + psraw m4, 7 +%ifidn %1, h8_add_src + movu m5, [srcq] + mova m7, m5 + pand m5, [even_byte_mask] + psrlw m7, 8 + paddsw m0, m5 + paddsw m4, m7 +%endif + packuswb m0, m0 + packuswb m4, m4 + punpcklbw m0, m4 +%ifidn %1, h8_avg + pavgb m0, [dstq] +%endif + lea srcq, [srcq + sstrideq] + mova [dstq], m0 + lea dstq, [dstq + dstrideq] + dec heightd + jnz .loop + REP_RET +%endm + +INIT_XMM ssse3 +SUBPIX_HFILTER16 h8 +SUBPIX_HFILTER16 h8_avg +SUBPIX_HFILTER8 h8 +SUBPIX_HFILTER8 h8_avg +SUBPIX_HFILTER4 h8 +SUBPIX_HFILTER4 h8_avg + +%if CONFIG_LOOP_RESTORATION +SUBPIX_HFILTER16 h8_add_src +SUBPIX_HFILTER8 h8_add_src +SUBPIX_HFILTER4 h8_add_src +%endif + +;------------------------------------------------------------------------------- + +; TODO(Linfeng): Detect cpu type and choose the code with better performance. +%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1 + +%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + %define NUM_GENERAL_REG_USED 9 +%else + %define NUM_GENERAL_REG_USED 6 +%endif + +%macro SUBPIX_VFILTER 2 +cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%ifidn %2, 8 + %define movx movh +%else + %define movx movd +%endif + + dec heightd + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + mov src1q, srcq + add src1q, sstrideq + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + ;Do two rows at once + movx m0, [srcq ] ;A + movx m1, [src1q ] ;B + punpcklbw m0, m1 ;A B + movx m2, [srcq + sstrideq * 2 ] ;C + pmaddubsw m0, k0k1 + mova m6, m2 + movx m3, [src1q + sstrideq * 2] ;D + punpcklbw m2, m3 ;C D + pmaddubsw m2, k2k3 + movx m4, [srcq + sstrideq * 4 ] ;E + mova m7, m4 + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m4, k4k5 + punpcklbw m1, m6 ;A B next iter + movx m6, [srcq + sstride6q ] ;G + punpcklbw m5, m6 ;E F next iter + punpcklbw m3, m7 ;C D next iter + pmaddubsw m5, k4k5 + movx m7, [src1q + sstride6q ] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m6, k6k7 + pmaddubsw m3, k2k3 + pmaddubsw m1, k0k1 + paddsw m0, m4 + paddsw m2, m6 + movx m6, [srcq + sstrideq * 8 ] ;H next iter + punpcklbw m7, m6 + pmaddubsw m7, k6k7 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + paddsw m1, m5 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 + + paddsw m3, m7 + paddsw m1, m3 + paddsw m1, krd + psraw m1, 7 +%ifidn %1, v8_add_src + movu m4, [src1q] + punpcklbw m4, m6 + paddsw m1, m4 +%endif + lea srcq, [srcq + sstrideq * 2 ] + lea src1q, [src1q + sstrideq * 2] + packuswb m1, m1 + +%ifidn %1, v8_avg + movx m2, [dstq] + pavgb m0, m2 +%endif + movx [dstq], m0 + add dstq, dst_stride +%ifidn %1, v8_avg + movx m3, [dstq] + pavgb m1, m3 +%endif + movx [dstq], m1 + add dstq, dst_stride + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + movx m6, [srcq + sstride6q ] ;G + punpcklbw m0, m1 ;A B + movx m7, [src1q + sstride6q ] ;H + pmaddubsw m0, k0k1 + movx m2, [srcq + sstrideq * 2 ] ;C + punpcklbw m6, m7 ;G H + movx m3, [src1q + sstrideq * 2] ;D + pmaddubsw m6, k6k7 + movx m4, [srcq + sstrideq * 4 ] ;E + punpcklbw m2, m3 ;C D + movx m5, [src1q + sstrideq * 4] ;F + punpcklbw m4, m5 ;E F + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + paddsw m2, m6 + paddsw m0, m4 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [srcq] + punpcklbw m4, m6 + paddsw m0, m4 +%endif + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%else + ; ARCH_X86_64 + + movx m0, [srcq ] ;A + movx m1, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2 ] + movx m2, [srcq] ;C + movx m3, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2 ] + movx m4, [srcq] ;E + movx m5, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2 ] + movx m6, [srcq] ;G + punpcklbw m0, m1 ;A B + punpcklbw m1, m2 ;A B next iter + punpcklbw m2, m3 ;C D + punpcklbw m3, m4 ;C D next iter + punpcklbw m4, m5 ;E F + punpcklbw m5, m6 ;E F next iter + +.loop: + ;Do two rows at once + movx m7, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2 ] + movx m14, [srcq] ;H next iter + punpcklbw m6, m7 ;G H + punpcklbw m7, m14 ;G H next iter + pmaddubsw m8, m0, k0k1 + pmaddubsw m9, m1, k0k1 + mova m0, m2 + mova m1, m3 + pmaddubsw m10, m2, k2k3 + pmaddubsw m11, m3, k2k3 + mova m2, m4 + mova m3, m5 + pmaddubsw m4, k4k5 + pmaddubsw m5, k4k5 + paddsw m8, m4 + paddsw m9, m5 + mova m4, m6 + mova m5, m7 + pmaddubsw m6, k6k7 + pmaddubsw m7, k6k7 + paddsw m10, m6 + paddsw m11, m7 + paddsw m8, m10 + paddsw m9, m11 + mova m6, m14 + paddsw m8, krd + paddsw m9, krd + psraw m8, 7 + psraw m9, 7 +%ifidn %2, 4 + packuswb m8, m8 + packuswb m9, m9 +%else + packuswb m8, m9 +%endif + +%ifidn %1, v8_avg + movx m7, [dstq] +%ifidn %2, 4 + movx m10, [dstq + dstrideq] + pavgb m9, m10 +%else + movhpd m7, [dstq + dstrideq] +%endif + pavgb m8, m7 +%endif + movx [dstq], m8 +%ifidn %2, 4 + movx [dstq + dstrideq], m9 +%else + movhpd [dstq + dstrideq], m8 +%endif + + lea dstq, [dstq + dstrideq * 2 ] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movx m7, [srcq + sstrideq] ;H + punpcklbw m6, m7 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m2, k2k3 + pmaddubsw m4, k4k5 + pmaddubsw m6, k6k7 + paddsw m0, m4 + paddsw m2, m6 + paddsw m0, m2 + paddsw m0, krd + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, v8_avg + movx m1, [dstq] + pavgb m0, m1 +%endif + movx [dstq], m0 + +%endif ; ARCH_X86_64 + +.done: + REP_RET + +%endm + +;------------------------------------------------------------------------------- +%macro SUBPIX_VFILTER16 1 +cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ + src, sstride, dst, dstride, height, filter + mova m4, [filterq] + SETUP_LOCAL_VARS + +%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON + +%if ARCH_X86_64 + %define src1q r7 + %define sstride6q r8 + %define dst_stride dstrideq +%else + %define src1q filterq + %define sstride6q dstrideq + %define dst_stride dstridemp +%endif + lea src1q, [srcq + sstrideq] + lea sstride6q, [sstrideq + sstrideq * 4] + add sstride6q, sstrideq ;pitch * 6 + +.loop: + movh m0, [srcq ] ;A + movh m1, [src1q ] ;B + movh m2, [srcq + sstrideq * 2 ] ;C + movh m3, [src1q + sstrideq * 2] ;D + movh m4, [srcq + sstrideq * 4 ] ;E + movh m5, [src1q + sstrideq * 4] ;F + + punpcklbw m0, m1 ;A B + movh m6, [srcq + sstride6q] ;G + punpcklbw m2, m3 ;C D + movh m7, [src1q + sstride6q] ;H + punpcklbw m4, m5 ;E F + pmaddubsw m0, k0k1 + movh m3, [srcq + 8] ;A + pmaddubsw m2, k2k3 + punpcklbw m6, m7 ;G H + movh m5, [srcq + sstrideq + 8] ;B + pmaddubsw m4, k4k5 + punpcklbw m3, m5 ;A B + movh m7, [srcq + sstrideq * 2 + 8] ;C + pmaddubsw m6, k6k7 + movh m5, [src1q + sstrideq * 2 + 8] ;D + punpcklbw m7, m5 ;C D + paddsw m2, m6 + pmaddubsw m3, k0k1 + movh m1, [srcq + sstrideq * 4 + 8] ;E + paddsw m0, m4 + pmaddubsw m7, k2k3 + movh m6, [src1q + sstrideq * 4 + 8] ;F + punpcklbw m1, m6 ;E F + paddsw m0, m2 + paddsw m0, krd + movh m2, [srcq + sstride6q + 8] ;G + pmaddubsw m1, k4k5 + movh m5, [src1q + sstride6q + 8] ;H + psraw m0, 7 + punpcklbw m2, m5 ;G H + pmaddubsw m2, k6k7 + paddsw m7, m2 + paddsw m3, m1 + paddsw m3, m7 + paddsw m3, krd + psraw m3, 7 +%ifidn %1, v8_add_src + pxor m6, m6 + movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down + mova m5, m4 + punpcklbw m4, m6 + punpckhbw m5, m6 + paddsw m0, m4 + paddsw m3, m5 +%endif + packuswb m0, m3 + + add srcq, sstrideq + add src1q, sstrideq +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + add dstq, dst_stride + dec heightd + jnz .loop + REP_RET + +%else + ; ARCH_X86_64 + dec heightd + + movu m1, [srcq ] ;A + movu m3, [srcq + sstrideq ] ;B + lea srcq, [srcq + sstrideq * 2] + punpcklbw m0, m1, m3 ;A B + punpckhbw m1, m3 ;A B + movu m5, [srcq] ;C + punpcklbw m2, m3, m5 ;A B next iter + punpckhbw m3, m5 ;A B next iter + mova tmp0, m2 ;store to stack + mova tmp1, m3 ;store to stack + movu m7, [srcq + sstrideq] ;D + lea srcq, [srcq + sstrideq * 2] + punpcklbw m4, m5, m7 ;C D + punpckhbw m5, m7 ;C D + movu m9, [srcq] ;E + punpcklbw m6, m7, m9 ;C D next iter + punpckhbw m7, m9 ;C D next iter + movu m11, [srcq + sstrideq] ;F + lea srcq, [srcq + sstrideq * 2] + punpcklbw m8, m9, m11 ;E F + punpckhbw m9, m11 ;E F + movu m2, [srcq] ;G + punpcklbw m10, m11, m2 ;E F next iter + punpckhbw m11, m2 ;E F next iter + +.loop: + ;Do two rows at once + pmaddubsw m13, m0, k0k1 + mova m0, m4 + pmaddubsw m14, m8, k4k5 + pmaddubsw m15, m4, k2k3 + mova m4, m8 + paddsw m13, m14 + movu m3, [srcq + sstrideq] ;H + lea srcq, [srcq + sstrideq * 2] + punpcklbw m14, m2, m3 ;G H + mova m8, m14 + pmaddubsw m14, k6k7 + paddsw m15, m14 + paddsw m13, m15 + paddsw m13, krd + psraw m13, 7 + + pmaddubsw m14, m1, k0k1 + pmaddubsw m1, m9, k4k5 + pmaddubsw m15, m5, k2k3 + paddsw m14, m1 + mova m1, m5 + mova m5, m9 + punpckhbw m2, m3 ;G H + mova m9, m2 + pmaddubsw m2, k6k7 + paddsw m15, m2 + paddsw m14, m15 + paddsw m14, krd + psraw m14, 7 + packuswb m13, m14 +%ifidn %1, v8_avg + pavgb m13, [dstq] +%endif + mova [dstq], m13 + + ; next iter + pmaddubsw m15, tmp0, k0k1 + pmaddubsw m14, m10, k4k5 + pmaddubsw m13, m6, k2k3 + paddsw m15, m14 + mova tmp0, m6 + mova m6, m10 + movu m2, [srcq] ;G next iter + punpcklbw m14, m3, m2 ;G H next iter + mova m10, m14 + pmaddubsw m14, k6k7 + paddsw m13, m14 + paddsw m15, m13 + paddsw m15, krd + psraw m15, 7 + + pmaddubsw m14, tmp1, k0k1 + mova tmp1, m7 + pmaddubsw m13, m7, k2k3 + mova m7, m11 + pmaddubsw m11, k4k5 + paddsw m14, m11 + punpckhbw m3, m2 ;G H next iter + mova m11, m3 + pmaddubsw m3, k6k7 + paddsw m13, m3 + paddsw m14, m13 + paddsw m14, krd + psraw m14, 7 + packuswb m15, m14 +%ifidn %1, v8_avg + pavgb m15, [dstq + dstrideq] +%endif + mova [dstq + dstrideq], m15 + lea dstq, [dstq + dstrideq * 2] + sub heightd, 2 + jg .loop + + ; Do last row if output_height is odd + jne .done + + movu m3, [srcq + sstrideq] ;H + punpcklbw m6, m2, m3 ;G H + punpckhbw m2, m3 ;G H + pmaddubsw m0, k0k1 + pmaddubsw m1, k0k1 + pmaddubsw m4, k2k3 + pmaddubsw m5, k2k3 + pmaddubsw m8, k4k5 + pmaddubsw m9, k4k5 + pmaddubsw m6, k6k7 + pmaddubsw m2, k6k7 + paddsw m0, m8 + paddsw m1, m9 + paddsw m4, m6 + paddsw m5, m2 + paddsw m0, m4 + paddsw m1, m5 + paddsw m0, krd + paddsw m1, krd + psraw m0, 7 + psraw m1, 7 + packuswb m0, m1 +%ifidn %1, v8_avg + pavgb m0, [dstq] +%endif + mova [dstq], m0 + +.done: + REP_RET + +%endif ; ARCH_X86_64 + +%endm + +INIT_XMM ssse3 +SUBPIX_VFILTER16 v8 +SUBPIX_VFILTER16 v8_avg +SUBPIX_VFILTER v8, 8 +SUBPIX_VFILTER v8_avg, 8 +SUBPIX_VFILTER v8, 4 +SUBPIX_VFILTER v8_avg, 4 + +%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \ + CONFIG_LOOP_RESTORATION +SUBPIX_VFILTER16 v8_add_src +SUBPIX_VFILTER v8_add_src, 8 +SUBPIX_VFILTER v8_add_src, 4 +%endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm new file mode 100644 index 000000000..8f025a8be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -0,0 +1,451 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm3, [rdx] ;load filters + pshuflw xmm4, xmm3, 11111111b ;k3 + psrldq xmm3, 8 + pshuflw xmm3, xmm3, 0b ;k4 + punpcklqdq xmm4, xmm3 ;k3k4 + + movq xmm3, rcx ;rounding + pshufd xmm3, xmm3, 0 + + pxor xmm2, xmm2 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + + punpckldq xmm0, xmm1 ;two row in one register + punpcklbw xmm0, xmm2 ;unpack to word + pmullw xmm0, xmm4 ;multiply the filter factors + + movdqa xmm1, xmm0 + psrldq xmm1, 8 + paddsw xmm0, xmm1 + + paddsw xmm0, xmm3 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm7, [rdx] ;load filters + + pshuflw xmm6, xmm7, 11111111b ;k3 + pshufhw xmm7, xmm7, 0b ;k4 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + movq xmm4, rcx ;rounding + pshufd xmm4, xmm4, 0 + + pxor xmm5, xmm5 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 ;rounding + psraw xmm0, 7 ;shift + packuswb xmm0, xmm0 ;pack back to byte +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + punpckhbw xmm3, xmm5 + + pmullw xmm0, xmm6 + pmullw xmm1, xmm7 + pmullw xmm2, xmm6 + pmullw xmm3, xmm7 + + paddsw xmm0, xmm1 + paddsw xmm2, xmm3 + + paddsw xmm0, xmm4 ;rounding + paddsw xmm2, xmm4 + psraw xmm0, 7 ;shift + psraw xmm2, 7 + packuswb xmm0, xmm2 ;pack back to byte +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(aom_filter_block1d4_v2_sse2) PRIVATE +sym(aom_filter_block1d4_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_sse2) PRIVATE +sym(aom_filter_block1d8_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_sse2) PRIVATE +sym(aom_filter_block1d16_v2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE +sym(aom_filter_block1d4_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE +sym(aom_filter_block1d8_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE +sym(aom_filter_block1d16_v2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_sse2) PRIVATE +sym(aom_filter_block1d4_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_sse2) PRIVATE +sym(aom_filter_block1d8_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_sse2) PRIVATE +sym(aom_filter_block1d16_h2_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE +sym(aom_filter_block1d4_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE +sym(aom_filter_block1d8_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE +sym(aom_filter_block1d16_h2_avg_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm new file mode 100644 index 000000000..b9b2da0be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -0,0 +1,421 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro GET_PARAM_4 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm3, [rdx] ;load filters + psrldq xmm3, 6 + packsswb xmm3, xmm3 + pshuflw xmm3, xmm3, 0b ;k3_k4 + + movd xmm2, ecx ;rounding_shift + pshufd xmm2, xmm2, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_4 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm3 + + pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack to byte + +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro GET_PARAM 0 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov ecx, 0x01000100 + + movdqa xmm7, [rdx] ;load filters + psrldq xmm7, 6 + packsswb xmm7, xmm7 + pshuflw xmm7, xmm7, 0b ;k3_k4 + punpcklwd xmm7, xmm7 + + movd xmm6, ecx ;rounding_shift + pshufd xmm6, xmm6, 0 + + movsxd rax, DWORD PTR arg(1) ;pixels_per_line + movsxd rdx, DWORD PTR arg(3) ;out_pitch + movsxd rcx, DWORD PTR arg(4) ;output_height +%endm + +%macro APPLY_FILTER_8 1 + punpcklbw xmm0, xmm1 + pmaddubsw xmm0, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + packuswb xmm0, xmm0 ;pack back to byte + +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +%macro APPLY_FILTER_16 1 + punpcklbw xmm0, xmm1 + punpckhbw xmm2, xmm1 + pmaddubsw xmm0, xmm7 + pmaddubsw xmm2, xmm7 + + pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7) + pmulhrsw xmm2, xmm6 + packuswb xmm0, xmm2 ;pack back to byte + +%if %1 + movdqu xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movdqu [rdi], xmm0 ;store the result + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + dec rcx +%endm + +global sym(aom_filter_block1d4_v2_ssse3) PRIVATE +sym(aom_filter_block1d4_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_ssse3) PRIVATE +sym(aom_filter_block1d8_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_ssse3) PRIVATE +sym(aom_filter_block1d16_v2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE +sym(aom_filter_block1d4_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movd xmm0, [rsi] ;load src + movd xmm1, [rsi + rax] + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE +sym(aom_filter_block1d8_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movq xmm0, [rsi] ;0 + movq xmm1, [rsi + rax] ;1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE +sym(aom_filter_block1d16_v2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;0 + movdqu xmm1, [rsi + rax] ;1 + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_ssse3) PRIVATE +sym(aom_filter_block1d4_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_ssse3) PRIVATE +sym(aom_filter_block1d8_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_ssse3) PRIVATE +sym(aom_filter_block1d16_h2_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 0 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE +sym(aom_filter_block1d4_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + + GET_PARAM_4 +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_4 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE +sym(aom_filter_block1d8_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqa xmm1, xmm0 + psrldq xmm1, 1 + + APPLY_FILTER_8 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE +sym(aom_filter_block1d16_h2_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + ; end prolog + + GET_PARAM +.loop: + movdqu xmm0, [rsi] ;load src + movdqu xmm1, [rsi + 1] + movdqa xmm2, xmm0 + + APPLY_FILTER_16 1 + jnz .loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 000000000..bcdc20f63 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/x86/synonyms.h" + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; + u0 = _mm_setzero_si128(); + // Row 0 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff0 = _mm_max_epi16(diff, negdiff); + // Row 1 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(absdiff0, absdiff); + minabsdiff = _mm_min_epi16(absdiff0, absdiff); + // Row 2 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 3 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 4 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 5 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 6 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 7 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); + *max = _mm_extract_epi16(maxabsdiff, 0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0); + s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 2 * p), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 3 * p), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + +static void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 = _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = _mm_unpacklo_epi32(a4, a5); + b2 = _mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = _mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, + int16_t *coeff) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + _mm_store_si128((__m128i *)coeff, src[0]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[1]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[2]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[3]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[4]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[5]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[6]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[7]); +} + +void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, + int16_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + int16_t const *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + _mm_store_si128((__m128i *)coeff, coeff0); + _mm_store_si128((__m128i *)(coeff + 64), coeff1); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + _mm_store_si128((__m128i *)(coeff + 128), coeff2); + _mm_store_si128((__m128i *)(coeff + 192), coeff3); + + coeff += 8; + } +} + +int aom_satd_sse2(const int16_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = _mm_load_si128((const __m128i *)coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); + coeff += 8; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride, + int height) { + int idx; + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_unpacklo_epi8(src_line, zero); + __m128i s1 = _mm_unpackhi_epi8(src_line, zero); + __m128i t0, t1; + int height_1 = height - 1; + ref += ref_stride; + + for (idx = 1; idx < height_1; idx += 2) { + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + } + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + + if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } + + _mm_storeu_si128((__m128i *)hbuf, s0); + hbuf += 8; + _mm_storeu_si128((__m128i *)hbuf, s1); +} + +int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) { + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_load_si128((const __m128i *)ref); + __m128i s0 = _mm_sad_epu8(src_line, zero); + __m128i s1; + int i; + + for (i = 16; i < width; i += 16) { + ref += 16; + src_line = _mm_load_si128((const __m128i *)ref); + s1 = _mm_sad_epu8(src_line, zero); + s0 = _mm_adds_epu16(s0, s1); + } + + s1 = _mm_srli_si128(s0, 8); + s0 = _mm_adds_epu16(s0, s1); + + return _mm_extract_epi16(s0, 0); +} + +int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) { + int idx; + int width = 4 << bwl; + int16_t mean; + __m128i v0 = _mm_loadu_si128((const __m128i *)ref); + __m128i v1 = _mm_load_si128((const __m128i *)src); + __m128i diff = _mm_subs_epi16(v0, v1); + __m128i sum = diff; + __m128i sse = _mm_madd_epi16(diff, diff); + + ref += 8; + src += 8; + + for (idx = 8; idx < width; idx += 8) { + v0 = _mm_loadu_si128((const __m128i *)ref); + v1 = _mm_load_si128((const __m128i *)src); + diff = _mm_subs_epi16(v0, v1); + + sum = _mm_add_epi16(sum, diff); + v0 = _mm_madd_epi16(diff, diff); + sse = _mm_add_epi32(sse, v0); + + ref += 8; + src += 8; + } + + v0 = _mm_srli_si128(sum, 8); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi64(sum, 32); + sum = _mm_add_epi16(sum, v0); + v0 = _mm_srli_epi32(sum, 16); + sum = _mm_add_epi16(sum, v0); + + v1 = _mm_srli_si128(sse, 8); + sse = _mm_add_epi32(sse, v1); + v1 = _mm_srli_epi64(sse, 32); + sse = _mm_add_epi32(sse, v1); + + mean = _mm_extract_epi16(sum, 0); + + return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); +} diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm new file mode 100644 index 000000000..b2d150296 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm @@ -0,0 +1,124 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%define private_prefix aom + +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the hadamard transformation. Part +; of the macro definitions are originally derived from the ffmpeg project. +; The current version applies to x86 64-bit only. + +SECTION .text + +%if ARCH_X86_64 +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +%macro HMD8_1D 0 + psubw m8, m0, m1 + psubw m9, m2, m3 + paddw m0, m1 + paddw m2, m3 + SWAP 1, 8 + SWAP 3, 9 + psubw m8, m4, m5 + psubw m9, m6, m7 + paddw m4, m5 + paddw m6, m7 + SWAP 5, 8 + SWAP 7, 9 + + psubw m8, m0, m2 + psubw m9, m1, m3 + paddw m0, m2 + paddw m1, m3 + SWAP 2, 8 + SWAP 3, 9 + psubw m8, m4, m6 + psubw m9, m5, m7 + paddw m4, m6 + paddw m5, m7 + SWAP 6, 8 + SWAP 7, 9 + + psubw m8, m0, m4 + psubw m9, m1, m5 + paddw m0, m4 + paddw m1, m5 + SWAP 4, 8 + SWAP 5, 9 + psubw m8, m2, m6 + psubw m9, m3, m7 + paddw m2, m6 + paddw m3, m7 + SWAP 6, 8 + SWAP 7, 9 +%endmacro + +INIT_XMM ssse3 +cglobal hadamard_8x8, 3, 5, 10, input, stride, output + lea r3, [2 * strideq] + lea r4, [4 * strideq] + + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + HMD8_1D + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + HMD8_1D + + mova [outputq + 0], m0 + mova [outputq + 16], m1 + mova [outputq + 32], m2 + mova [outputq + 48], m3 + mova [outputq + 64], m4 + mova [outputq + 80], m5 + mova [outputq + 96], m6 + mova [outputq + 112], m7 + + RET +%endif diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c new file mode 100644 index 000000000..e916e4ff9 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom/aom_integer.h" + +#include "./aom_dsp_rtcd.h" + +// To start out, just dispatch to the function using the 2D mask and +// pass mask stride as 0. This can be improved upon if necessary. + +void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, 0, h, w, 0, 0); +} + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_blend_a64_hmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, + src1_8, src1_stride, mask, 0, h, w, 0, 0, + bd); +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c new file mode 100644 index 000000000..68d74e517 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c @@ -0,0 +1,924 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE4.1 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "./aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, + int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_m0l_b = xx_loadl_64(mask + c); + const __m128i v_m0h_b = xx_loadl_64(mask + c + 8); + const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b); + const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_rl_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1)); + const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1)); + + const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b); + const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zero = _mm_setzero_si128(); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ra_b = xx_loadu_128(mask + c); + const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_mask_sx_sy_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_sx_sy_w16n_sse4_1( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 16) { + const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); + const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); + const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); + const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); + const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); + const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); + const __m128i v_rvsbl_w = + _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); + const __m128i v_rvsbh_w = + _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); + const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); + const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); + const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); + const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); + + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int suby, int subx) { + typedef void (*blend_fn)( + uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); + + // Dimensions are: width_index X subx X suby + static const blend_fn blend[3][2][2] = { + { // w % 16 == 0 + { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, + { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, + { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, + { // w == 8 + { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, + { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, h, w, suby, subx); + } else { + blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + src0_stride, src1, src1_stride, + mask, mask_stride, h, w); + } +} + +#if CONFIG_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b10); +} + +static void blend_a64_mask_b12_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_m0_b = xx_loadl_64(mask + c); + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_a64_mask_b12_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_r_b = xx_loadu_128(mask + 2 * c); + const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); + + const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadl_64(mask + c); + const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); + const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_a64_mask_b12_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Horizontal and Vertical sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b10); +} + +static void blend_a64_mask_b12_sx_sy_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + (void)w; + blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, + blend_4_b12); +} + +static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + blend_unit_fn blend) { + const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, + 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + for (c = 0; c < w; c += 8) { + const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); + const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); + const __m128i v_rvsb_w = + _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); + const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b10); +} + +static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w) { + blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h, w, + blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, + const uint8_t *src0_8, + uint32_t src0_stride, + const uint8_t *src1_8, + uint32_t src1_stride, const uint8_t *mask, + uint32_t mask_stride, int h, int w, + int suby, int subx, int bd) { + typedef void (*blend_fn)( + uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w); + + // Dimensions are: bd_index X width_index X subx X suby + static const blend_fn blend[2][2][2][2] = { + { // bd == 8 or 10 + { // w % 8 == 0 + { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, + { blend_a64_mask_b10_sx_w8n_sse4_1, + blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, + { blend_a64_mask_b10_sx_w4_sse4_1, + blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, + { // bd == 12 + { // w % 8 == 0 + { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, + { blend_a64_mask_b12_sx_w8n_sse4_1, + blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, + { // w == 4 + { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, + { blend_a64_mask_b12_sx_w4_sse4_1, + blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, mask_stride, h, w, suby, + subx, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w); + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c new file mode 100644 index 000000000..9dabe5b79 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE4.1 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/blend_sse4.h" + +#include "./aom_dsp_rtcd.h" + +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + (void)w; + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, + uint32_t src0_stride, + const uint8_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 16) { + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resh_w = + blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); + + const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + + xx_storeu_128(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); + + // Dimension: width_index + static const blend_fn blend[9] = { + blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 + aom_blend_a64_vmask_c, // w == 1 + aom_blend_a64_vmask_c, // w == 2 + NULL, // INVALID + blend_a64_vmask_w4_sse4_1, // w == 4 + NULL, // INVALID + NULL, // INVALID + NULL, // INVALID + blend_a64_vmask_w8_sse4_1, // w == 8 + }; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, + w); +} + +#if CONFIG_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// Implementation - No sub-sampling +////////////////////////////////////////////////////////////////////////////// + +static INLINE void blend_a64_vmask_bn_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + + const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); + + xx_storel_64(dst, v_res_w); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b10); +} + +static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + (void)w; + blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, blend_4_b12); +} + +static INLINE void blend_a64_vmask_bn_w8n_sse4_1( + uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, + uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w, blend_unit_fn blend) { + const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + do { + int c; + const __m128i v_m0_w = _mm_set1_epi16(*mask); + const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + for (c = 0; c < w; c += 8) { + const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); + + xx_storeu_128(dst + c, v_res_w); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 1; + } while (--h); +} + +static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w, blend_8_b10); +} + +static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, + const uint16_t *src0, + uint32_t src0_stride, + const uint16_t *src1, + uint32_t src1_stride, + const uint8_t *mask, int h, int w) { + blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w, blend_8_b12); +} + +////////////////////////////////////////////////////////////////////////////// +// Dispatch +////////////////////////////////////////////////////////////////////////////// + +void aom_highbd_blend_a64_vmask_sse4_1( + uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, + uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, + const uint8_t *mask, int h, int w, int bd) { + typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, + const uint16_t *src0, uint32_t src0_stride, + const uint16_t *src1, uint32_t src1_stride, + const uint8_t *mask, int h, int w); + + // Dimensions are: bd_index X width_index + static const blend_fn blend[2][2] = { + { + // bd == 8 or 10 + blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b10_w4_sse4_1, // w == 4 + }, + { + // bd == 12 + blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 + blend_a64_vmask_b12_w4_sse4_1, // w == 4 + } + }; + + assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); + assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + assert(bd == 8 || bd == 10 || bd == 12); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, + src1_stride, mask, h, w, bd); + } else { + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); + const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); + const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); + + blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, h, w); + } +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h new file mode 100644 index 000000000..daa2b2b3a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_DSP_X86_BLEND_SSE4_H_ + +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/synonyms.h" + +////////////////////////////////////////////////////////////////////////////// +// Common kernels +////////////////////////////////////////////////////////////////////////////// + +static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); + const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +#if CONFIG_HIGHBITDEPTH +typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w); + +static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + + const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); + + const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); + + return v_res_w; +} + +static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadl_64(src0); + const __m128i v_s1_w = xx_loadl_64(src1); + + // Interleave + const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w); + + // Scale + const __m128i v_ssum_d = + _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} + +static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, + const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i v_s0_w = xx_loadu_128(src0); + const __m128i v_s1_w = xx_loadu_128(src1); + + // Interleave + const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w); + const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w); + const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w); + const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w); + + // Multiply-Add + const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w); + const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w); + + // Scale + const __m128i v_ssuml_d = + _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1); + const __m128i v_ssumh_d = + _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1); + + // Pack + const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d); + + // Round + const __m128i v_res_w = xx_round_epu16(v_pssum_d); + + return v_res_w; +} +#endif // CONFIG_HIGHBITDEPTH + +#endif // AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h new file mode 100644 index 000000000..8641164db --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_DSP_X86_CONVOLVE_H_ +#define AOM_DSP_X86_CONVOLVE_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_convolve.h" + +typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, const int16_t *filter); + +#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void aom_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + (void)filter_x; \ + (void)x_step_q4; \ + (void)filter_y; \ + (void)y_step_q4; \ + assert((-128 <= filter[3]) && (filter[3] <= 127)); \ + assert(step_q4 == 16); \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + if (w) { \ + aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h); \ + } \ + } + +#define FUN_CONV_2D(avg, opt) \ + void aom_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ + assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \ + filter_y[1] || filter_y[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \ + MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h + 7); \ + aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ + dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, \ + h + 1); \ + aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, \ + y_step_q4, w, h); \ + } \ + } + +#if CONFIG_LOOP_RESTORATION +// convolve_add_src is only used by the Wiener filter, which will never +// end up calling the bilinear functions (it uses a symmetric filter, so +// the possible numbers of taps are 1,3,5,7) +#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \ + opt) \ + void aom_convolve8_##name##_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + (void)filter_x; \ + (void)x_step_q4; \ + (void)filter_y; \ + (void)y_step_q4; \ + assert((-128 <= filter[3]) && (filter[3] <= 127)); \ + assert(step_q4 == 16); \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + if (w) { \ + aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h); \ + } \ + } + +#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \ + void aom_convolve8_##type##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ + assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + aom_convolve8_##htype##horiz_##opt( \ + src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \ + x_step_q4, filter_y, y_step_q4, w, h + 7); \ + aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ + dst, dst_stride, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h); \ + } +#endif + +#if CONFIG_HIGHBITDEPTH +typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, + const ptrdiff_t src_pitch, + uint16_t *output_ptr, + ptrdiff_t out_pitch, + unsigned int output_height, + const int16_t *filter, int bd); + +#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ + void aom_highbd_convolve8_##name##_##opt( \ + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } \ + } \ + if (w) { \ + aom_highbd_convolve8_##name##_c( \ + CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } + +#define HIGH_FUN_CONV_2D(avg, opt) \ + void aom_highbd_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ + assert(w <= MAX_SB_SIZE); \ + assert(h <= MAX_SB_SIZE); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ + filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ + aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + CONVERT_TO_BYTEPTR(fdata2), \ + MAX_SB_SIZE, filter_x, x_step_q4, \ + filter_y, y_step_q4, w, h + 7, bd); \ + aom_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \ + dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, \ + fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ + aom_highbd_convolve8_horiz_##opt( \ + src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ + aom_highbd_convolve8_##avg##vert_##opt( \ + CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ + } \ + } else { \ + aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ + filter_x, x_step_q4, filter_y, y_step_q4, \ + w, h, bd); \ + } \ + } +#endif // CONFIG_HIGHBITDEPTH + +#endif // AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c new file mode 100644 index 000000000..b8ec08de7 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c @@ -0,0 +1,862 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "aom_dsp/fwd_txfm.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +// Apply a 32-element IDCT to 8 columns. This does not do any transposition +// of its output - the caller is expected to do that. +// The input buffers are the top and bottom halves of an 8x32 block. +void fdct32_8col(__m128i *in0, __m128i *in1) { + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + { + const __m128i *ina = in0; + const __m128i *inb = in1 + 15; + __m128i *step1a = &step1[0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + { + const __m128i *ina = in0 + 4; + const __m128i *inb = in1 + 11; + __m128i *step1a = &step1[4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + { + const __m128i *ina = in0 + 8; + const __m128i *inb = in1 + 7; + __m128i *step1a = &step1[8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + { + const __m128i *ina = in0 + 12; + const __m128i *inb = in1 + 3; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + } + // Stage 2 + { + step2[0] = _mm_add_epi16(step1[0], step1[15]); + step2[1] = _mm_add_epi16(step1[1], step1[14]); + step2[2] = _mm_add_epi16(step1[2], step1[13]); + step2[3] = _mm_add_epi16(step1[3], step1[12]); + step2[4] = _mm_add_epi16(step1[4], step1[11]); + step2[5] = _mm_add_epi16(step1[5], step1[10]); + step2[6] = _mm_add_epi16(step1[6], step1[9]); + step2[7] = _mm_add_epi16(step1[7], step1[8]); + step2[8] = _mm_sub_epi16(step1[7], step1[8]); + step2[9] = _mm_sub_epi16(step1[6], step1[9]); + step2[10] = _mm_sub_epi16(step1[5], step1[10]); + step2[11] = _mm_sub_epi16(step1[4], step1[11]); + step2[12] = _mm_sub_epi16(step1[3], step1[12]); + step2[13] = _mm_sub_epi16(step1[2], step1[13]); + step2[14] = _mm_sub_epi16(step1[1], step1[14]); + step2[15] = _mm_sub_epi16(step1[0], step1[15]); + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); + } + // Stage 3 + { + step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm_add_epi16(step2[23], step1[16]); + step3[17] = _mm_add_epi16(step2[22], step1[17]); + step3[18] = _mm_add_epi16(step2[21], step1[18]); + step3[19] = _mm_add_epi16(step2[20], step1[19]); + step3[20] = _mm_sub_epi16(step1[19], step2[20]); + step3[21] = _mm_sub_epi16(step1[18], step2[21]); + step3[22] = _mm_sub_epi16(step1[17], step2[22]); + step3[23] = _mm_sub_epi16(step1[16], step2[23]); + step3[24] = _mm_sub_epi16(step1[31], step2[24]); + step3[25] = _mm_sub_epi16(step1[30], step2[25]); + step3[26] = _mm_sub_epi16(step1[29], step2[26]); + step3[27] = _mm_sub_epi16(step1[28], step2[27]); + step3[28] = _mm_add_epi16(step2[27], step1[28]); + step3[29] = _mm_add_epi16(step2[26], step1[29]); + step3[30] = _mm_add_epi16(step2[25], step1[30]); + step3[31] = _mm_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[0] = _mm_add_epi16(step3[3], step3[0]); + step1[1] = _mm_add_epi16(step3[2], step3[1]); + step1[2] = _mm_sub_epi16(step3[1], step3[2]); + step1[3] = _mm_sub_epi16(step3[0], step3[3]); + step1[8] = _mm_add_epi16(step3[11], step2[8]); + step1[9] = _mm_add_epi16(step3[10], step2[9]); + step1[10] = _mm_sub_epi16(step2[9], step3[10]); + step1[11] = _mm_sub_epi16(step2[8], step3[11]); + step1[12] = _mm_sub_epi16(step2[15], step3[12]); + step1[13] = _mm_sub_epi16(step2[14], step3[13]); + step1[14] = _mm_add_epi16(step3[13], step2[14]); + step1[15] = _mm_add_epi16(step3[12], step2[15]); + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm_add_epi16(step1[5], step3[4]); + step2[5] = _mm_sub_epi16(step3[4], step1[5]); + step2[6] = _mm_sub_epi16(step3[7], step1[6]); + step2[7] = _mm_add_epi16(step1[6], step3[7]); + } + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm_add_epi16(step1[19], step3[16]); + step2[17] = _mm_add_epi16(step1[18], step3[17]); + step2[18] = _mm_sub_epi16(step3[17], step1[18]); + step2[19] = _mm_sub_epi16(step3[16], step1[19]); + step2[20] = _mm_sub_epi16(step3[23], step1[20]); + step2[21] = _mm_sub_epi16(step3[22], step1[21]); + step2[22] = _mm_add_epi16(step1[21], step3[22]); + step2[23] = _mm_add_epi16(step1[20], step3[23]); + step2[24] = _mm_add_epi16(step1[27], step3[24]); + step2[25] = _mm_add_epi16(step1[26], step3[25]); + step2[26] = _mm_sub_epi16(step3[25], step1[26]); + step2[27] = _mm_sub_epi16(step3[24], step1[27]); + step2[28] = _mm_sub_epi16(step3[31], step1[28]); + step2[29] = _mm_sub_epi16(step3[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step3[30]); + step2[31] = _mm_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); + } + { + step3[8] = _mm_add_epi16(step2[9], step1[8]); + step3[9] = _mm_sub_epi16(step1[8], step2[9]); + step3[10] = _mm_sub_epi16(step1[11], step2[10]); + step3[11] = _mm_add_epi16(step2[10], step1[11]); + step3[12] = _mm_add_epi16(step2[13], step1[12]); + step3[13] = _mm_sub_epi16(step1[12], step2[13]); + step3[14] = _mm_sub_epi16(step1[15], step2[14]); + step3[15] = _mm_add_epi16(step2[14], step1[15]); + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm_add_epi16(step3[17], step2[16]); + step1[17] = _mm_sub_epi16(step2[16], step3[17]); + step1[18] = _mm_sub_epi16(step2[19], step3[18]); + step1[19] = _mm_add_epi16(step3[18], step2[19]); + step1[20] = _mm_add_epi16(step3[21], step2[20]); + step1[21] = _mm_sub_epi16(step2[20], step3[21]); + step1[22] = _mm_sub_epi16(step2[23], step3[22]); + step1[23] = _mm_add_epi16(step3[22], step2[23]); + step1[24] = _mm_add_epi16(step3[25], step2[24]); + step1[25] = _mm_sub_epi16(step2[24], step3[25]); + step1[26] = _mm_sub_epi16(step2[27], step3[26]); + step1[27] = _mm_add_epi16(step3[26], step2[27]); + step1[28] = _mm_add_epi16(step3[29], step2[28]); + step1[29] = _mm_sub_epi16(step2[28], step3[29]); + step1[30] = _mm_sub_epi16(step2[31], step3[30]); + step1[31] = _mm_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); + } + + // Output results + { + int j; + for (j = 0; j < 16; ++j) { + _mm_storeu_si128((__m128i *)(in0 + j), out[j]); + _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]); + } + } +} // NOLINT diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h new file mode 100644 index 000000000..216739581 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -0,0 +1,3022 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // AVX2 + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_intrin.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +#if FDCT32x32_HIGH_PRECISION +static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { + __m256i buf0, buf1; + buf0 = _mm256_mul_epu32(a, b); + a = _mm256_srli_epi64(a, 32); + b = _mm256_srli_epi64(b, 32); + buf1 = _mm256_mul_epu32(a, b); + return _mm256_add_epi64(buf0, buf1); +} + +static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) { + __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm256_unpacklo_epi64(buf0, buf1); +} +#endif + +#ifndef STORE_COEFF_FUNC +#define STORE_COEFF_FUNC +static void store_coeff(const __m256i *coeff, tran_low_t *curr, + tran_low_t *next) { + __m128i u = _mm256_castsi256_si128(*coeff); + storeu_output(&u, curr); + u = _mm256_extractf128_si256(*coeff, 1); + storeu_output(&u, next); +} +#endif + +void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org, + int stride) { + // Calculate pre-multiplied strides + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i k__cospi_p16_m16 = + pair256_set_epi16(+cospi_16_64, -cospi_16_64); + const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); + const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); + const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i k__cospi_m12_m20 = + pair256_set_epi16(-cospi_12_64, -cospi_20_64); + const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); + const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); + const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); + const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); + const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); + const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); + const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); + const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); + const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); + const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); + const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); + const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); + const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); + const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); + const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); + const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); + const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); + const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); + const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + const __m256i kZero = _mm256_set1_epi16(0); + const __m256i kOne = _mm256_set1_epi16(1); + // Do the two transform/transpose passes + int pass; + for (pass = 0; pass < 2; ++pass) { + // We process sixteen columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 16) { + __m256i step1[32]; + __m256i step2[32]; + __m256i step3[32]; + __m256i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + const int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m256i *step1a = &step1[0]; + __m256i *step1b = &step1[31]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m256i *step1a = &step1[4]; + __m256i *step1b = &step1[27]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m256i *step1a = &step1[8]; + __m256i *step1b = &step1[23]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; + __m256i *step1a = &step1[12]; + __m256i *step1b = &step1[19]; + const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); + const __m256i ina1 = + _mm256_loadu_si256((const __m256i *)(ina + str1)); + const __m256i ina2 = + _mm256_loadu_si256((const __m256i *)(ina + str2)); + const __m256i ina3 = + _mm256_loadu_si256((const __m256i *)(ina + str3)); + const __m256i inb3 = + _mm256_loadu_si256((const __m256i *)(inb - str3)); + const __m256i inb2 = + _mm256_loadu_si256((const __m256i *)(inb - str2)); + const __m256i inb1 = + _mm256_loadu_si256((const __m256i *)(inb - str1)); + const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); + step1a[0] = _mm256_add_epi16(ina0, inb0); + step1a[1] = _mm256_add_epi16(ina1, inb1); + step1a[2] = _mm256_add_epi16(ina2, inb2); + step1a[3] = _mm256_add_epi16(ina3, inb3); + step1b[-3] = _mm256_sub_epi16(ina3, inb3); + step1b[-2] = _mm256_sub_epi16(ina2, inb2); + step1b[-1] = _mm256_sub_epi16(ina1, inb1); + step1b[-0] = _mm256_sub_epi16(ina0, inb0); + step1a[0] = _mm256_slli_epi16(step1a[0], 2); + step1a[1] = _mm256_slli_epi16(step1a[1], 2); + step1a[2] = _mm256_slli_epi16(step1a[2], 2); + step1a[3] = _mm256_slli_epi16(step1a[3], 2); + step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); + __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); + __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); + __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); + __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); + __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); + __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); + __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); + step1[0] = _mm256_add_epi16(in00, in31); + step1[1] = _mm256_add_epi16(in01, in30); + step1[2] = _mm256_add_epi16(in02, in29); + step1[3] = _mm256_add_epi16(in03, in28); + step1[28] = _mm256_sub_epi16(in03, in28); + step1[29] = _mm256_sub_epi16(in02, in29); + step1[30] = _mm256_sub_epi16(in01, in30); + step1[31] = _mm256_sub_epi16(in00, in31); + } + { + __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); + __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); + __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); + __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); + __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); + __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); + __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); + __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); + step1[4] = _mm256_add_epi16(in04, in27); + step1[5] = _mm256_add_epi16(in05, in26); + step1[6] = _mm256_add_epi16(in06, in25); + step1[7] = _mm256_add_epi16(in07, in24); + step1[24] = _mm256_sub_epi16(in07, in24); + step1[25] = _mm256_sub_epi16(in06, in25); + step1[26] = _mm256_sub_epi16(in05, in26); + step1[27] = _mm256_sub_epi16(in04, in27); + } + { + __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); + __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); + __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); + __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); + __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); + __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); + __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); + __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); + step1[8] = _mm256_add_epi16(in08, in23); + step1[9] = _mm256_add_epi16(in09, in22); + step1[10] = _mm256_add_epi16(in10, in21); + step1[11] = _mm256_add_epi16(in11, in20); + step1[20] = _mm256_sub_epi16(in11, in20); + step1[21] = _mm256_sub_epi16(in10, in21); + step1[22] = _mm256_sub_epi16(in09, in22); + step1[23] = _mm256_sub_epi16(in08, in23); + } + { + __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); + __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); + __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); + __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); + __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); + __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); + __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); + __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); + step1[12] = _mm256_add_epi16(in12, in19); + step1[13] = _mm256_add_epi16(in13, in18); + step1[14] = _mm256_add_epi16(in14, in17); + step1[15] = _mm256_add_epi16(in15, in16); + step1[16] = _mm256_sub_epi16(in15, in16); + step1[17] = _mm256_sub_epi16(in14, in17); + step1[18] = _mm256_sub_epi16(in13, in18); + step1[19] = _mm256_sub_epi16(in12, in19); + } + } + // Stage 2 + { + step2[0] = _mm256_add_epi16(step1[0], step1[15]); + step2[1] = _mm256_add_epi16(step1[1], step1[14]); + step2[2] = _mm256_add_epi16(step1[2], step1[13]); + step2[3] = _mm256_add_epi16(step1[3], step1[12]); + step2[4] = _mm256_add_epi16(step1[4], step1[11]); + step2[5] = _mm256_add_epi16(step1[5], step1[10]); + step2[6] = _mm256_add_epi16(step1[6], step1[9]); + step2[7] = _mm256_add_epi16(step1[7], step1[8]); + step2[8] = _mm256_sub_epi16(step1[7], step1[8]); + step2[9] = _mm256_sub_epi16(step1[6], step1[9]); + step2[10] = _mm256_sub_epi16(step1[5], step1[10]); + step2[11] = _mm256_sub_epi16(step1[4], step1[11]); + step2[12] = _mm256_sub_epi16(step1[3], step1[12]); + step2[13] = _mm256_sub_epi16(step1[2], step1[13]); + step2[14] = _mm256_sub_epi16(step1[1], step1[14]); + step2[15] = _mm256_sub_epi16(step1[0], step1[15]); + } + { + const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]); + const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]); + const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]); + const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]); + const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]); + const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]); + const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]); + const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]); + const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s2_20_4 = + _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m256i s2_20_5 = + _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m256i s2_21_4 = + _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m256i s2_21_5 = + _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m256i s2_22_4 = + _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m256i s2_22_5 = + _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m256i s2_23_4 = + _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m256i s2_23_5 = + _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m256i s2_24_4 = + _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m256i s2_24_5 = + _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m256i s2_25_4 = + _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m256i s2_25_5 = + _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m256i s2_26_4 = + _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m256i s2_26_5 = + _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m256i s2_27_4 = + _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m256i s2_27_5 = + _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7); + } + +#if !FDCT32x32_HIGH_PRECISION + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. + if (1 == pass) { + __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]); + __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]); + __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]); + __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]); + __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]); + __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]); + __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]); + __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]); + __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]); + __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]); + __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]); + __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]); + __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]); + __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]); + __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]); + __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]); + __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]); + __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]); + __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]); + __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]); + __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]); + __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]); + __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]); + __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]); + __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]); + __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]); + __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]); + __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]); + __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]); + __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]); + __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]); + __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]); + + step2[0] = _mm256_sub_epi16(step2[0], s3_00_0); + step2[1] = _mm256_sub_epi16(step2[1], s3_01_0); + step2[2] = _mm256_sub_epi16(step2[2], s3_02_0); + step2[3] = _mm256_sub_epi16(step2[3], s3_03_0); + step2[4] = _mm256_sub_epi16(step2[4], s3_04_0); + step2[5] = _mm256_sub_epi16(step2[5], s3_05_0); + step2[6] = _mm256_sub_epi16(step2[6], s3_06_0); + step2[7] = _mm256_sub_epi16(step2[7], s3_07_0); + step2[8] = _mm256_sub_epi16(step2[8], s2_08_0); + step2[9] = _mm256_sub_epi16(step2[9], s2_09_0); + step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); + step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); + step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); + step2[13] = _mm256_sub_epi16(step2[13], s3_13_0); + step2[14] = _mm256_sub_epi16(step2[14], s2_14_0); + step2[15] = _mm256_sub_epi16(step2[15], s2_15_0); + step1[16] = _mm256_sub_epi16(step1[16], s3_16_0); + step1[17] = _mm256_sub_epi16(step1[17], s3_17_0); + step1[18] = _mm256_sub_epi16(step1[18], s3_18_0); + step1[19] = _mm256_sub_epi16(step1[19], s3_19_0); + step2[20] = _mm256_sub_epi16(step2[20], s3_20_0); + step2[21] = _mm256_sub_epi16(step2[21], s3_21_0); + step2[22] = _mm256_sub_epi16(step2[22], s3_22_0); + step2[23] = _mm256_sub_epi16(step2[23], s3_23_0); + step2[24] = _mm256_sub_epi16(step2[24], s3_24_0); + step2[25] = _mm256_sub_epi16(step2[25], s3_25_0); + step2[26] = _mm256_sub_epi16(step2[26], s3_26_0); + step2[27] = _mm256_sub_epi16(step2[27], s3_27_0); + step1[28] = _mm256_sub_epi16(step1[28], s3_28_0); + step1[29] = _mm256_sub_epi16(step1[29], s3_29_0); + step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); + step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); + + step2[0] = _mm256_add_epi16(step2[0], kOne); + step2[1] = _mm256_add_epi16(step2[1], kOne); + step2[2] = _mm256_add_epi16(step2[2], kOne); + step2[3] = _mm256_add_epi16(step2[3], kOne); + step2[4] = _mm256_add_epi16(step2[4], kOne); + step2[5] = _mm256_add_epi16(step2[5], kOne); + step2[6] = _mm256_add_epi16(step2[6], kOne); + step2[7] = _mm256_add_epi16(step2[7], kOne); + step2[8] = _mm256_add_epi16(step2[8], kOne); + step2[9] = _mm256_add_epi16(step2[9], kOne); + step2[10] = _mm256_add_epi16(step2[10], kOne); + step2[11] = _mm256_add_epi16(step2[11], kOne); + step2[12] = _mm256_add_epi16(step2[12], kOne); + step2[13] = _mm256_add_epi16(step2[13], kOne); + step2[14] = _mm256_add_epi16(step2[14], kOne); + step2[15] = _mm256_add_epi16(step2[15], kOne); + step1[16] = _mm256_add_epi16(step1[16], kOne); + step1[17] = _mm256_add_epi16(step1[17], kOne); + step1[18] = _mm256_add_epi16(step1[18], kOne); + step1[19] = _mm256_add_epi16(step1[19], kOne); + step2[20] = _mm256_add_epi16(step2[20], kOne); + step2[21] = _mm256_add_epi16(step2[21], kOne); + step2[22] = _mm256_add_epi16(step2[22], kOne); + step2[23] = _mm256_add_epi16(step2[23], kOne); + step2[24] = _mm256_add_epi16(step2[24], kOne); + step2[25] = _mm256_add_epi16(step2[25], kOne); + step2[26] = _mm256_add_epi16(step2[26], kOne); + step2[27] = _mm256_add_epi16(step2[27], kOne); + step1[28] = _mm256_add_epi16(step1[28], kOne); + step1[29] = _mm256_add_epi16(step1[29], kOne); + step1[30] = _mm256_add_epi16(step1[30], kOne); + step1[31] = _mm256_add_epi16(step1[31], kOne); + + step2[0] = _mm256_srai_epi16(step2[0], 2); + step2[1] = _mm256_srai_epi16(step2[1], 2); + step2[2] = _mm256_srai_epi16(step2[2], 2); + step2[3] = _mm256_srai_epi16(step2[3], 2); + step2[4] = _mm256_srai_epi16(step2[4], 2); + step2[5] = _mm256_srai_epi16(step2[5], 2); + step2[6] = _mm256_srai_epi16(step2[6], 2); + step2[7] = _mm256_srai_epi16(step2[7], 2); + step2[8] = _mm256_srai_epi16(step2[8], 2); + step2[9] = _mm256_srai_epi16(step2[9], 2); + step2[10] = _mm256_srai_epi16(step2[10], 2); + step2[11] = _mm256_srai_epi16(step2[11], 2); + step2[12] = _mm256_srai_epi16(step2[12], 2); + step2[13] = _mm256_srai_epi16(step2[13], 2); + step2[14] = _mm256_srai_epi16(step2[14], 2); + step2[15] = _mm256_srai_epi16(step2[15], 2); + step1[16] = _mm256_srai_epi16(step1[16], 2); + step1[17] = _mm256_srai_epi16(step1[17], 2); + step1[18] = _mm256_srai_epi16(step1[18], 2); + step1[19] = _mm256_srai_epi16(step1[19], 2); + step2[20] = _mm256_srai_epi16(step2[20], 2); + step2[21] = _mm256_srai_epi16(step2[21], 2); + step2[22] = _mm256_srai_epi16(step2[22], 2); + step2[23] = _mm256_srai_epi16(step2[23], 2); + step2[24] = _mm256_srai_epi16(step2[24], 2); + step2[25] = _mm256_srai_epi16(step2[25], 2); + step2[26] = _mm256_srai_epi16(step2[26], 2); + step2[27] = _mm256_srai_epi16(step2[27], 2); + step1[28] = _mm256_srai_epi16(step1[28], 2); + step1[29] = _mm256_srai_epi16(step1[29], 2); + step1[30] = _mm256_srai_epi16(step1[30], 2); + step1[31] = _mm256_srai_epi16(step1[31], 2); + } +#endif + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); + step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); + step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); + step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); + step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); + step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); + step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); + step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); + } + { + step3[16] = _mm256_add_epi16(step2[23], step1[16]); + step3[17] = _mm256_add_epi16(step2[22], step1[17]); + step3[18] = _mm256_add_epi16(step2[21], step1[18]); + step3[19] = _mm256_add_epi16(step2[20], step1[19]); + step3[20] = _mm256_sub_epi16(step1[19], step2[20]); + step3[21] = _mm256_sub_epi16(step1[18], step2[21]); + step3[22] = _mm256_sub_epi16(step1[17], step2[22]); + step3[23] = _mm256_sub_epi16(step1[16], step2[23]); + step3[24] = _mm256_sub_epi16(step1[31], step2[24]); + step3[25] = _mm256_sub_epi16(step1[30], step2[25]); + step3[26] = _mm256_sub_epi16(step1[29], step2[26]); + step3[27] = _mm256_sub_epi16(step1[28], step2[27]); + step3[28] = _mm256_add_epi16(step2[27], step1[28]); + step3[29] = _mm256_add_epi16(step2[26], step1[29]); + step3[30] = _mm256_add_epi16(step2[25], step1[30]); + step3[31] = _mm256_add_epi16(step2[24], step1[31]); + } + + // Stage 4 + { + step1[0] = _mm256_add_epi16(step3[3], step3[0]); + step1[1] = _mm256_add_epi16(step3[2], step3[1]); + step1[2] = _mm256_sub_epi16(step3[1], step3[2]); + step1[3] = _mm256_sub_epi16(step3[0], step3[3]); + step1[8] = _mm256_add_epi16(step3[11], step2[8]); + step1[9] = _mm256_add_epi16(step3[10], step2[9]); + step1[10] = _mm256_sub_epi16(step2[9], step3[10]); + step1[11] = _mm256_sub_epi16(step2[8], step3[11]); + step1[12] = _mm256_sub_epi16(step2[15], step3[12]); + step1[13] = _mm256_sub_epi16(step2[14], step3[13]); + step1[14] = _mm256_add_epi16(step3[13], step2[14]); + step1[15] = _mm256_add_epi16(step3[12], step2[15]); + } + { + const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); + const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); + const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s1_05_4 = + _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m256i s1_05_5 = + _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m256i s1_06_4 = + _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m256i s1_06_5 = + _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); + } + { + const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); + const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); + const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); + const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); + const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); + const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); + const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); + const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); + const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s1_18_4 = + _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m256i s1_18_5 = + _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m256i s1_19_4 = + _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m256i s1_19_5 = + _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m256i s1_20_4 = + _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m256i s1_20_5 = + _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m256i s1_21_4 = + _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m256i s1_21_5 = + _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m256i s1_26_4 = + _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m256i s1_26_5 = + _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m256i s1_27_4 = + _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m256i s1_27_5 = + _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m256i s1_28_4 = + _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m256i s1_28_5 = + _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m256i s1_29_4 = + _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m256i s1_29_5 = + _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); + } + // Stage 5 + { + step2[4] = _mm256_add_epi16(step1[5], step3[4]); + step2[5] = _mm256_sub_epi16(step3[4], step1[5]); + step2[6] = _mm256_sub_epi16(step3[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[6], step3[7]); + } + { + const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); + const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); + const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); + const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); + const __m256i out_00_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m256i out_00_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m256i out_16_2 = + _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m256i out_16_3 = + _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m256i out_08_2 = + _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m256i out_08_3 = + _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m256i out_24_2 = + _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m256i out_24_3 = + _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m256i out_00_4 = + _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m256i out_00_5 = + _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m256i out_16_4 = + _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m256i out_16_5 = + _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m256i out_08_4 = + _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m256i out_08_5 = + _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m256i out_24_4 = + _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m256i out_24_5 = + _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm256_packs_epi32(out_00_6, out_00_7); + out[16] = _mm256_packs_epi32(out_16_6, out_16_7); + out[8] = _mm256_packs_epi32(out_08_6, out_08_7); + out[24] = _mm256_packs_epi32(out_24_6, out_24_7); + } + { + const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]); + const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]); + const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); + const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); + const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m256i s2_09_4 = + _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m256i s2_09_5 = + _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m256i s2_10_4 = + _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m256i s2_10_5 = + _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m256i s2_13_4 = + _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m256i s2_13_5 = + _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m256i s2_14_4 = + _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m256i s2_14_5 = + _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); + } + { + step2[16] = _mm256_add_epi16(step1[19], step3[16]); + step2[17] = _mm256_add_epi16(step1[18], step3[17]); + step2[18] = _mm256_sub_epi16(step3[17], step1[18]); + step2[19] = _mm256_sub_epi16(step3[16], step1[19]); + step2[20] = _mm256_sub_epi16(step3[23], step1[20]); + step2[21] = _mm256_sub_epi16(step3[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[21], step3[22]); + step2[23] = _mm256_add_epi16(step1[20], step3[23]); + step2[24] = _mm256_add_epi16(step1[27], step3[24]); + step2[25] = _mm256_add_epi16(step1[26], step3[25]); + step2[26] = _mm256_sub_epi16(step3[25], step1[26]); + step2[27] = _mm256_sub_epi16(step3[24], step1[27]); + step2[28] = _mm256_sub_epi16(step3[31], step1[28]); + step2[29] = _mm256_sub_epi16(step3[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step3[30]); + step2[31] = _mm256_add_epi16(step1[28], step3[31]); + } + // Stage 6 + { + const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); + const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); + const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); + const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); + const __m256i out_04_2 = + _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m256i out_04_3 = + _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m256i out_20_2 = + _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m256i out_20_3 = + _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m256i out_12_2 = + _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m256i out_12_3 = + _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m256i out_28_2 = + _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m256i out_28_3 = + _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m256i out_04_4 = + _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m256i out_04_5 = + _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m256i out_20_4 = + _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m256i out_20_5 = + _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m256i out_12_4 = + _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m256i out_12_5 = + _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m256i out_28_4 = + _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m256i out_28_5 = + _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm256_packs_epi32(out_04_6, out_04_7); + out[20] = _mm256_packs_epi32(out_20_6, out_20_7); + out[12] = _mm256_packs_epi32(out_12_6, out_12_7); + out[28] = _mm256_packs_epi32(out_28_6, out_28_7); + } + { + step3[8] = _mm256_add_epi16(step2[9], step1[8]); + step3[9] = _mm256_sub_epi16(step1[8], step2[9]); + step3[10] = _mm256_sub_epi16(step1[11], step2[10]); + step3[11] = _mm256_add_epi16(step2[10], step1[11]); + step3[12] = _mm256_add_epi16(step2[13], step1[12]); + step3[13] = _mm256_sub_epi16(step1[12], step2[13]); + step3[14] = _mm256_sub_epi16(step1[15], step2[14]); + step3[15] = _mm256_add_epi16(step2[14], step1[15]); + } + { + const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); + const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); + const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); + const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); + const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); + const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); + const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); + const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); + const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m256i s3_17_4 = + _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m256i s3_17_5 = + _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m256i s3_18_4 = + _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m256i s3_18_5 = + _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m256i s3_21_4 = + _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m256i s3_21_5 = + _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m256i s3_22_4 = + _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m256i s3_22_5 = + _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m256i s3_25_4 = + _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m256i s3_25_5 = + _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m256i s3_26_4 = + _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m256i s3_26_5 = + _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m256i s3_29_4 = + _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m256i s3_29_5 = + _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m256i s3_30_4 = + _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m256i s3_30_5 = + _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); + } + // Stage 7 + { + const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]); + const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]); + const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]); + const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]); + const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); + const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); + const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); + const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); + const __m256i out_02_2 = + _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m256i out_02_3 = + _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m256i out_18_2 = + _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m256i out_18_3 = + _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m256i out_10_2 = + _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m256i out_10_3 = + _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m256i out_26_2 = + _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m256i out_26_3 = + _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m256i out_06_2 = + _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m256i out_06_3 = + _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m256i out_22_2 = + _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m256i out_22_3 = + _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m256i out_14_2 = + _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m256i out_14_3 = + _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m256i out_30_2 = + _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m256i out_30_3 = + _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m256i out_02_4 = + _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m256i out_02_5 = + _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m256i out_18_4 = + _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m256i out_18_5 = + _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m256i out_10_4 = + _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m256i out_10_5 = + _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m256i out_26_4 = + _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m256i out_26_5 = + _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m256i out_06_4 = + _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m256i out_06_5 = + _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m256i out_22_4 = + _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m256i out_22_5 = + _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m256i out_14_4 = + _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m256i out_14_5 = + _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m256i out_30_4 = + _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m256i out_30_5 = + _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm256_packs_epi32(out_02_6, out_02_7); + out[18] = _mm256_packs_epi32(out_18_6, out_18_7); + out[10] = _mm256_packs_epi32(out_10_6, out_10_7); + out[26] = _mm256_packs_epi32(out_26_6, out_26_7); + out[6] = _mm256_packs_epi32(out_06_6, out_06_7); + out[22] = _mm256_packs_epi32(out_22_6, out_22_7); + out[14] = _mm256_packs_epi32(out_14_6, out_14_7); + out[30] = _mm256_packs_epi32(out_30_6, out_30_7); + } + { + step1[16] = _mm256_add_epi16(step3[17], step2[16]); + step1[17] = _mm256_sub_epi16(step2[16], step3[17]); + step1[18] = _mm256_sub_epi16(step2[19], step3[18]); + step1[19] = _mm256_add_epi16(step3[18], step2[19]); + step1[20] = _mm256_add_epi16(step3[21], step2[20]); + step1[21] = _mm256_sub_epi16(step2[20], step3[21]); + step1[22] = _mm256_sub_epi16(step2[23], step3[22]); + step1[23] = _mm256_add_epi16(step3[22], step2[23]); + step1[24] = _mm256_add_epi16(step3[25], step2[24]); + step1[25] = _mm256_sub_epi16(step2[24], step3[25]); + step1[26] = _mm256_sub_epi16(step2[27], step3[26]); + step1[27] = _mm256_add_epi16(step3[26], step2[27]); + step1[28] = _mm256_add_epi16(step3[29], step2[28]); + step1[29] = _mm256_sub_epi16(step2[28], step3[29]); + step1[30] = _mm256_sub_epi16(step2[31], step3[30]); + step1[31] = _mm256_add_epi16(step3[30], step2[31]); + } + // Final stage --- outputs indices are bit-reversed. + { + const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); + const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); + const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); + const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); + const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); + const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); + const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); + const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); + const __m256i out_01_2 = + _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m256i out_01_3 = + _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m256i out_17_2 = + _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m256i out_17_3 = + _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m256i out_09_2 = + _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m256i out_09_3 = + _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m256i out_25_2 = + _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m256i out_25_3 = + _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m256i out_07_2 = + _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m256i out_07_3 = + _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m256i out_23_2 = + _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m256i out_23_3 = + _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m256i out_15_2 = + _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m256i out_15_3 = + _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m256i out_31_2 = + _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m256i out_31_3 = + _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m256i out_01_4 = + _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m256i out_01_5 = + _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m256i out_17_4 = + _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m256i out_17_5 = + _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m256i out_09_4 = + _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m256i out_09_5 = + _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m256i out_25_4 = + _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m256i out_25_5 = + _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m256i out_07_4 = + _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m256i out_07_5 = + _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m256i out_23_4 = + _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m256i out_23_5 = + _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m256i out_15_4 = + _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m256i out_15_5 = + _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m256i out_31_4 = + _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m256i out_31_5 = + _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm256_packs_epi32(out_01_6, out_01_7); + out[17] = _mm256_packs_epi32(out_17_6, out_17_7); + out[9] = _mm256_packs_epi32(out_09_6, out_09_7); + out[25] = _mm256_packs_epi32(out_25_6, out_25_7); + out[7] = _mm256_packs_epi32(out_07_6, out_07_7); + out[23] = _mm256_packs_epi32(out_23_6, out_23_7); + out[15] = _mm256_packs_epi32(out_15_6, out_15_7); + out[31] = _mm256_packs_epi32(out_31_6, out_31_7); + } + { + const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); + const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); + const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); + const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); + const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); + const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); + const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); + const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); + const __m256i out_05_2 = + _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m256i out_05_3 = + _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m256i out_21_2 = + _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m256i out_21_3 = + _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m256i out_13_2 = + _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m256i out_13_3 = + _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m256i out_29_2 = + _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m256i out_29_3 = + _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m256i out_03_2 = + _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m256i out_03_3 = + _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m256i out_19_2 = + _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m256i out_19_3 = + _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m256i out_11_2 = + _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m256i out_11_3 = + _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m256i out_27_2 = + _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m256i out_27_3 = + _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m256i out_05_4 = + _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m256i out_05_5 = + _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m256i out_21_4 = + _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m256i out_21_5 = + _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m256i out_13_4 = + _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m256i out_13_5 = + _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m256i out_29_4 = + _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m256i out_29_5 = + _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m256i out_03_4 = + _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m256i out_03_5 = + _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m256i out_19_4 = + _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m256i out_19_5 = + _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m256i out_11_4 = + _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m256i out_11_5 = + _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m256i out_27_4 = + _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m256i out_27_5 = + _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm256_packs_epi32(out_05_6, out_05_7); + out[21] = _mm256_packs_epi32(out_21_6, out_21_7); + out[13] = _mm256_packs_epi32(out_13_6, out_13_7); + out[29] = _mm256_packs_epi32(out_29_6, out_29_7); + out[3] = _mm256_packs_epi32(out_03_6, out_03_7); + out[19] = _mm256_packs_epi32(out_19_6, out_19_7); + out[11] = _mm256_packs_epi32(out_11_6, out_11_7); + out[27] = _mm256_packs_epi32(out_27_6, out_27_7); + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m256i lstep1[64], lstep2[64], lstep3[64]; + __m256i u[32], v[32], sign[16]; + const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length priori to addition operations + lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero); + lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero); + lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne); + lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne); + lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne); + lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne); + lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne); + lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne); + lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne); + + lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]); + lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]); + lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]); + } + { + const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); + const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); + const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); + const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); + const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m256i s3_10_4 = + _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m256i s3_10_5 = + _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m256i s3_11_4 = + _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m256i s3_11_5 = + _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m256i s3_12_4 = + _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m256i s3_12_5 = + _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m256i s3_13_4 = + _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m256i s3_13_5 = + _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero); + lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero); + lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero); + lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero); + lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero); + lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero); + lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero); + lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero); + lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero); + lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero); + lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero); + lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero); + lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero); + lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero); + lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero); + lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero); + lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne); + lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne); + lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne); + lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne); + lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne); + lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne); + lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne); + lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne); + lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne); + lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne); + lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne); + lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne); + lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne); + lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne); + lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne); + lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne); + + lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero); + lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero); + lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero); + lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero); + lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero); + lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero); + lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero); + lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero); + lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero); + lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero); + lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero); + lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero); + lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero); + lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero); + lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero); + lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero); + lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne); + lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne); + lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne); + lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne); + lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne); + lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne); + lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne); + lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne); + lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne); + lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne); + lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne); + lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne); + lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne); + lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne); + lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne); + lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne); + + lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]); + + lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]); + lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]); + lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]); + } + + // stage 4 + { + // expanding to 32-bit length priori to addition operations + lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero); + lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); + lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero); + lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); + lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero); + lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne); + lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne); + lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne); + lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne); + lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne); + lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne); + lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne); + lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne); + + lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]); + lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. + v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08); + v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08); + v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); + v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); + v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); + v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08); + v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08); + v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08); + v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24); + v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); + v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); + v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); + v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24); + v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24); + v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); + v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); + v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]); + lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m256i k32_p16_p16 = + pair256_set_epi32(cospi_16_64, cospi_16_64); + const __m256i k32_p16_m16 = + pair256_set_epi32(cospi_16_64, -cospi_16_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide + // instruction latency. + v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16); + v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16); + v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16); + v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16); + v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16); + v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16); + v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16); + v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16); + v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08); + v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08); + v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); + v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); + v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = _mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + // Combine + out[0] = _mm256_packs_epi32(u[0], u[1]); + out[16] = _mm256_packs_epi32(u[2], u[3]); + out[8] = _mm256_packs_epi32(u[4], u[5]); + out[24] = _mm256_packs_epi32(u[6], u[7]); + } + { + const __m256i k32_m08_p24 = + pair256_set_epi32(-cospi_8_64, cospi_24_64); + const __m256i k32_m24_m08 = + pair256_set_epi32(-cospi_24_64, -cospi_8_64); + const __m256i k32_p24_p08 = + pair256_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); + v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); + v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); + v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); + v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08); + v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); + v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); + v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); + v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24); + v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24); + v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); + v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); + v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); + v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08); + v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08); + v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); + u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04); + v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20); + v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); + v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); + v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); + v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28); + v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28); + v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); + sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); + sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); + sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); + sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); + sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); + sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); + sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); + + u[0] = _mm256_sub_epi32(u[0], sign[0]); + u[1] = _mm256_sub_epi32(u[1], sign[1]); + u[2] = _mm256_sub_epi32(u[2], sign[2]); + u[3] = _mm256_sub_epi32(u[3], sign[3]); + u[4] = _mm256_sub_epi32(u[4], sign[4]); + u[5] = _mm256_sub_epi32(u[5], sign[5]); + u[6] = _mm256_sub_epi32(u[6], sign[6]); + u[7] = _mm256_sub_epi32(u[7], sign[7]); + + u[0] = _mm256_add_epi32(u[0], K32One); + u[1] = _mm256_add_epi32(u[1], K32One); + u[2] = _mm256_add_epi32(u[2], K32One); + u[3] = _mm256_add_epi32(u[3], K32One); + u[4] = _mm256_add_epi32(u[4], K32One); + u[5] = _mm256_add_epi32(u[5], K32One); + u[6] = _mm256_add_epi32(u[6], K32One); + u[7] = _mm256_add_epi32(u[7], K32One); + + u[0] = _mm256_srai_epi32(u[0], 2); + u[1] = _mm256_srai_epi32(u[1], 2); + u[2] = _mm256_srai_epi32(u[2], 2); + u[3] = _mm256_srai_epi32(u[3], 2); + u[4] = _mm256_srai_epi32(u[4], 2); + u[5] = _mm256_srai_epi32(u[5], 2); + u[6] = _mm256_srai_epi32(u[6], 2); + u[7] = _mm256_srai_epi32(u[7], 2); + + out[4] = _mm256_packs_epi32(u[0], u[1]); + out[20] = _mm256_packs_epi32(u[2], u[3]); + out[12] = _mm256_packs_epi32(u[4], u[5]); + out[28] = _mm256_packs_epi32(u[6], u[7]); + } + { + lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m256i k32_m04_p28 = + pair256_set_epi32(-cospi_4_64, cospi_28_64); + const __m256i k32_m28_m04 = + pair256_set_epi32(-cospi_28_64, -cospi_4_64); + const __m256i k32_m20_p12 = + pair256_set_epi32(-cospi_20_64, cospi_12_64); + const __m256i k32_m12_m20 = + pair256_set_epi32(-cospi_12_64, -cospi_20_64); + const __m256i k32_p12_p20 = + pair256_set_epi32(cospi_12_64, cospi_20_64); + const __m256i k32_p28_p04 = + pair256_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28); + v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28); + v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28); + v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28); + v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04); + v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04); + v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04); + v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04); + v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); + v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); + v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); + v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); + v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); + v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20); + v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20); + v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20); + v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12); + v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); + v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); + v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12); + v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20); + v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20); + v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); + v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20); + v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28); + v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28); + v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28); + v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28); + v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04); + v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04); + v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04); + v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m256i k32_p30_p02 = + pair256_set_epi32(cospi_30_64, cospi_2_64); + const __m256i k32_p14_p18 = + pair256_set_epi32(cospi_14_64, cospi_18_64); + const __m256i k32_p22_p10 = + pair256_set_epi32(cospi_22_64, cospi_10_64); + const __m256i k32_p06_p26 = + pair256_set_epi32(cospi_6_64, cospi_26_64); + const __m256i k32_m26_p06 = + pair256_set_epi32(-cospi_26_64, cospi_6_64); + const __m256i k32_m10_p22 = + pair256_set_epi32(-cospi_10_64, cospi_22_64); + const __m256i k32_m18_p14 = + pair256_set_epi32(-cospi_18_64, cospi_14_64); + const __m256i k32_m02_p30 = + pair256_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02); + v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02); + v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02); + v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02); + v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18); + v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18); + v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18); + v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18); + v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10); + v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10); + v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); + v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); + v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); + v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26); + v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26); + v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26); + v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06); + v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); + v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); + v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); + v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22); + v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22); + v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); + v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); + v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14); + v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14); + v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14); + v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14); + v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30); + v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30); + v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30); + v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[2] = _mm256_packs_epi32(u[0], u[1]); + out[18] = _mm256_packs_epi32(u[2], u[3]); + out[10] = _mm256_packs_epi32(u[4], u[5]); + out[26] = _mm256_packs_epi32(u[6], u[7]); + out[6] = _mm256_packs_epi32(u[8], u[9]); + out[22] = _mm256_packs_epi32(u[10], u[11]); + out[14] = _mm256_packs_epi32(u[12], u[13]); + out[30] = _mm256_packs_epi32(u[14], u[15]); + } + { + lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m256i k32_p31_p01 = + pair256_set_epi32(cospi_31_64, cospi_1_64); + const __m256i k32_p15_p17 = + pair256_set_epi32(cospi_15_64, cospi_17_64); + const __m256i k32_p23_p09 = + pair256_set_epi32(cospi_23_64, cospi_9_64); + const __m256i k32_p07_p25 = + pair256_set_epi32(cospi_7_64, cospi_25_64); + const __m256i k32_m25_p07 = + pair256_set_epi32(-cospi_25_64, cospi_7_64); + const __m256i k32_m09_p23 = + pair256_set_epi32(-cospi_9_64, cospi_23_64); + const __m256i k32_m17_p15 = + pair256_set_epi32(-cospi_17_64, cospi_15_64); + const __m256i k32_m01_p31 = + pair256_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01); + v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01); + v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01); + v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01); + v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17); + v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17); + v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17); + v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17); + v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09); + v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09); + v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); + v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); + v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); + v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25); + v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25); + v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25); + v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07); + v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); + v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); + v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); + v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23); + v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23); + v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); + v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); + v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15); + v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15); + v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15); + v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15); + v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31); + v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31); + v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31); + v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[1] = _mm256_packs_epi32(u[0], u[1]); + out[17] = _mm256_packs_epi32(u[2], u[3]); + out[9] = _mm256_packs_epi32(u[4], u[5]); + out[25] = _mm256_packs_epi32(u[6], u[7]); + out[7] = _mm256_packs_epi32(u[8], u[9]); + out[23] = _mm256_packs_epi32(u[10], u[11]); + out[15] = _mm256_packs_epi32(u[12], u[13]); + out[31] = _mm256_packs_epi32(u[14], u[15]); + } + { + const __m256i k32_p27_p05 = + pair256_set_epi32(cospi_27_64, cospi_5_64); + const __m256i k32_p11_p21 = + pair256_set_epi32(cospi_11_64, cospi_21_64); + const __m256i k32_p19_p13 = + pair256_set_epi32(cospi_19_64, cospi_13_64); + const __m256i k32_p03_p29 = + pair256_set_epi32(cospi_3_64, cospi_29_64); + const __m256i k32_m29_p03 = + pair256_set_epi32(-cospi_29_64, cospi_3_64); + const __m256i k32_m13_p19 = + pair256_set_epi32(-cospi_13_64, cospi_19_64); + const __m256i k32_m21_p11 = + pair256_set_epi32(-cospi_21_64, cospi_11_64); + const __m256i k32_m05_p27 = + pair256_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05); + v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05); + v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05); + v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05); + v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21); + v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21); + v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21); + v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21); + v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13); + v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13); + v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13); + v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); + v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); + v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29); + v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29); + v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29); + v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03); + v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); + v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); + v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); + v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19); + v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19); + v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); + v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); + v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11); + v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11); + v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11); + v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11); + v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27); + v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27); + v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27); + v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27); + + u[0] = k_packs_epi64_avx2(v[0], v[1]); + u[1] = k_packs_epi64_avx2(v[2], v[3]); + u[2] = k_packs_epi64_avx2(v[4], v[5]); + u[3] = k_packs_epi64_avx2(v[6], v[7]); + u[4] = k_packs_epi64_avx2(v[8], v[9]); + u[5] = k_packs_epi64_avx2(v[10], v[11]); + u[6] = k_packs_epi64_avx2(v[12], v[13]); + u[7] = k_packs_epi64_avx2(v[14], v[15]); + u[8] = k_packs_epi64_avx2(v[16], v[17]); + u[9] = k_packs_epi64_avx2(v[18], v[19]); + u[10] = k_packs_epi64_avx2(v[20], v[21]); + u[11] = k_packs_epi64_avx2(v[22], v[23]); + u[12] = k_packs_epi64_avx2(v[24], v[25]); + u[13] = k_packs_epi64_avx2(v[26], v[27]); + u[14] = k_packs_epi64_avx2(v[28], v[29]); + u[15] = k_packs_epi64_avx2(v[30], v[31]); + + v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm256_cmpgt_epi32(kZero, u[0]); + v[1] = _mm256_cmpgt_epi32(kZero, u[1]); + v[2] = _mm256_cmpgt_epi32(kZero, u[2]); + v[3] = _mm256_cmpgt_epi32(kZero, u[3]); + v[4] = _mm256_cmpgt_epi32(kZero, u[4]); + v[5] = _mm256_cmpgt_epi32(kZero, u[5]); + v[6] = _mm256_cmpgt_epi32(kZero, u[6]); + v[7] = _mm256_cmpgt_epi32(kZero, u[7]); + v[8] = _mm256_cmpgt_epi32(kZero, u[8]); + v[9] = _mm256_cmpgt_epi32(kZero, u[9]); + v[10] = _mm256_cmpgt_epi32(kZero, u[10]); + v[11] = _mm256_cmpgt_epi32(kZero, u[11]); + v[12] = _mm256_cmpgt_epi32(kZero, u[12]); + v[13] = _mm256_cmpgt_epi32(kZero, u[13]); + v[14] = _mm256_cmpgt_epi32(kZero, u[14]); + v[15] = _mm256_cmpgt_epi32(kZero, u[15]); + + u[0] = _mm256_sub_epi32(u[0], v[0]); + u[1] = _mm256_sub_epi32(u[1], v[1]); + u[2] = _mm256_sub_epi32(u[2], v[2]); + u[3] = _mm256_sub_epi32(u[3], v[3]); + u[4] = _mm256_sub_epi32(u[4], v[4]); + u[5] = _mm256_sub_epi32(u[5], v[5]); + u[6] = _mm256_sub_epi32(u[6], v[6]); + u[7] = _mm256_sub_epi32(u[7], v[7]); + u[8] = _mm256_sub_epi32(u[8], v[8]); + u[9] = _mm256_sub_epi32(u[9], v[9]); + u[10] = _mm256_sub_epi32(u[10], v[10]); + u[11] = _mm256_sub_epi32(u[11], v[11]); + u[12] = _mm256_sub_epi32(u[12], v[12]); + u[13] = _mm256_sub_epi32(u[13], v[13]); + u[14] = _mm256_sub_epi32(u[14], v[14]); + u[15] = _mm256_sub_epi32(u[15], v[15]); + + v[0] = _mm256_add_epi32(u[0], K32One); + v[1] = _mm256_add_epi32(u[1], K32One); + v[2] = _mm256_add_epi32(u[2], K32One); + v[3] = _mm256_add_epi32(u[3], K32One); + v[4] = _mm256_add_epi32(u[4], K32One); + v[5] = _mm256_add_epi32(u[5], K32One); + v[6] = _mm256_add_epi32(u[6], K32One); + v[7] = _mm256_add_epi32(u[7], K32One); + v[8] = _mm256_add_epi32(u[8], K32One); + v[9] = _mm256_add_epi32(u[9], K32One); + v[10] = _mm256_add_epi32(u[10], K32One); + v[11] = _mm256_add_epi32(u[11], K32One); + v[12] = _mm256_add_epi32(u[12], K32One); + v[13] = _mm256_add_epi32(u[13], K32One); + v[14] = _mm256_add_epi32(u[14], K32One); + v[15] = _mm256_add_epi32(u[15], K32One); + + u[0] = _mm256_srai_epi32(v[0], 2); + u[1] = _mm256_srai_epi32(v[1], 2); + u[2] = _mm256_srai_epi32(v[2], 2); + u[3] = _mm256_srai_epi32(v[3], 2); + u[4] = _mm256_srai_epi32(v[4], 2); + u[5] = _mm256_srai_epi32(v[5], 2); + u[6] = _mm256_srai_epi32(v[6], 2); + u[7] = _mm256_srai_epi32(v[7], 2); + u[8] = _mm256_srai_epi32(v[8], 2); + u[9] = _mm256_srai_epi32(v[9], 2); + u[10] = _mm256_srai_epi32(v[10], 2); + u[11] = _mm256_srai_epi32(v[11], 2); + u[12] = _mm256_srai_epi32(v[12], 2); + u[13] = _mm256_srai_epi32(v[13], 2); + u[14] = _mm256_srai_epi32(v[14], 2); + u[15] = _mm256_srai_epi32(v[15], 2); + + out[5] = _mm256_packs_epi32(u[0], u[1]); + out[21] = _mm256_packs_epi32(u[2], u[3]); + out[13] = _mm256_packs_epi32(u[4], u[5]); + out[29] = _mm256_packs_epi32(u[6], u[7]); + out[3] = _mm256_packs_epi32(u[8], u[9]); + out[19] = _mm256_packs_epi32(u[10], u[11]); + out[11] = _mm256_packs_epi32(u[12], u[13]); + out[27] = _mm256_packs_epi32(u[14], u[15]); + } + } +#endif + // Transpose the results, do it as four 8x8 transposes. + { + int transpose_block; + int16_t *output_currStep, *output_nextStep; + tran_low_t *curr_out, *next_out; + // Pass 0 + output_currStep = &intermediate[column_start * 32]; + output_nextStep = &intermediate[(column_start + 8) * 32]; + // Pass 1 + curr_out = &output_org[column_start * 32]; + next_out = &output_org[(column_start + 8) * 32]; + + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m256i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 + // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 + // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 + // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 + // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 + // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 + // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 + const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]); + const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]); + const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]); + const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]); + const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]); + const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]); + const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]); + const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]); + // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31 + // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71 + // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35 + // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75 + // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 101 + // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151 + // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115 + // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155 + + const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); + const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); + const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); + const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3); + const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5); + const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); + const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); + const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); + // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69 + // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73 + // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71 + // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75 + // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149 + // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153 + // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151 + // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155 + __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148 + // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149 + // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150 + // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151 + // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152 + // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153 + // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154 + // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... + __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero); + __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero); + __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero); + __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero); + __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero); + __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero); + __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero); + __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in av1/encoder/av1_dct.c + tr2_0 = _mm256_add_epi16(tr2_0, kOne); + tr2_1 = _mm256_add_epi16(tr2_1, kOne); + tr2_2 = _mm256_add_epi16(tr2_2, kOne); + tr2_3 = _mm256_add_epi16(tr2_3, kOne); + tr2_4 = _mm256_add_epi16(tr2_4, kOne); + tr2_5 = _mm256_add_epi16(tr2_5, kOne); + tr2_6 = _mm256_add_epi16(tr2_6, kOne); + tr2_7 = _mm256_add_epi16(tr2_7, kOne); + tr2_0 = _mm256_srai_epi16(tr2_0, 2); + tr2_1 = _mm256_srai_epi16(tr2_1, 2); + tr2_2 = _mm256_srai_epi16(tr2_2, 2); + tr2_3 = _mm256_srai_epi16(tr2_3, 2); + tr2_4 = _mm256_srai_epi16(tr2_4, 2); + tr2_5 = _mm256_srai_epi16(tr2_5, 2); + tr2_6 = _mm256_srai_epi16(tr2_6, 2); + tr2_7 = _mm256_srai_epi16(tr2_7, 2); + } + if (0 == pass) { + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), + _mm256_castsi256_si128(tr2_0)); + _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), + _mm256_castsi256_si128(tr2_1)); + _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), + _mm256_castsi256_si128(tr2_2)); + _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), + _mm256_castsi256_si128(tr2_3)); + _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), + _mm256_castsi256_si128(tr2_4)); + _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), + _mm256_castsi256_si128(tr2_5)); + _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), + _mm256_castsi256_si128(tr2_6)); + _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), + _mm256_castsi256_si128(tr2_7)); + + _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), + _mm256_extractf128_si256(tr2_0, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), + _mm256_extractf128_si256(tr2_1, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), + _mm256_extractf128_si256(tr2_2, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), + _mm256_extractf128_si256(tr2_3, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), + _mm256_extractf128_si256(tr2_4, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), + _mm256_extractf128_si256(tr2_5, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), + _mm256_extractf128_si256(tr2_6, 1)); + _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), + _mm256_extractf128_si256(tr2_7, 1)); + // Process next 8x8 + output_currStep += 8; + output_nextStep += 8; + } + if (1 == pass) { + store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32); + store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32); + store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32); + store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32); + store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32); + store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32); + store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32); + store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32); + curr_out += 8; + next_out += 8; + } + } + } + } + } + _mm256_zeroupper(); +} // NOLINT diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h new file mode 100644 index 000000000..69dd6af11 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -0,0 +1,3201 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "aom_dsp/fwd_txfm.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +// TODO(jingning) The high bit-depth version needs re-work for performance. +// The current SSE2 implementation also causes cross reference to the static +// functions in the C implementation file. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 +#if FDCT32x32_HIGH_PRECISION +void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + aom_fdct32(temp_in, temp_out, 0); + for (j = 0; j < 32; ++j) + out[j + i * 32] = + (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); + } +} +#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c +#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c +#else +void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) { + int i, j; + for (i = 0; i < 32; ++i) { + tran_high_t temp_in[32], temp_out[32]; + for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; + aom_fdct32(temp_in, temp_out, 1); + for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + } +} +#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c +#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c +#endif // FDCT32x32_HIGH_PRECISION +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif // DCT_HIGH_BIT_DEPTH + +void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { + // Calculate pre-multiplied strides + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); + const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + int pass; +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 32; column_start += 8) { + __m128i step1[32]; + __m128i step2[32]; + __m128i step3[32]; + __m128i out[32]; + // Stage 1 + // Note: even though all the loads below are aligned, using the aligned + // intrinsic make the code slightly slower. + if (0 == pass) { + const int16_t *in = &input[column_start]; + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + const int16_t *ina = in + 0 * str1; + const int16_t *inb = in + 31 * str1; + __m128i *step1a = &step1[0]; + __m128i *step1b = &step1[31]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 4 * str1; + const int16_t *inb = in + 27 * str1; + __m128i *step1a = &step1[4]; + __m128i *step1b = &step1[27]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 8 * str1; + const int16_t *inb = in + 23 * str1; + __m128i *step1a = &step1[8]; + __m128i *step1b = &step1[23]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + { + const int16_t *ina = in + 12 * str1; + const int16_t *inb = in + 19 * str1; + __m128i *step1a = &step1[12]; + __m128i *step1b = &step1[19]; + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); + step1a[0] = _mm_add_epi16(ina0, inb0); + step1a[1] = _mm_add_epi16(ina1, inb1); + step1a[2] = _mm_add_epi16(ina2, inb2); + step1a[3] = _mm_add_epi16(ina3, inb3); + step1b[-3] = _mm_sub_epi16(ina3, inb3); + step1b[-2] = _mm_sub_epi16(ina2, inb2); + step1b[-1] = _mm_sub_epi16(ina1, inb1); + step1b[-0] = _mm_sub_epi16(ina0, inb0); + step1a[0] = _mm_slli_epi16(step1a[0], 2); + step1a[1] = _mm_slli_epi16(step1a[1], 2); + step1a[2] = _mm_slli_epi16(step1a[2], 2); + step1a[3] = _mm_slli_epi16(step1a[3], 2); + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); + } + } else { + int16_t *in = &intermediate[column_start]; + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; + // Note: using the same approach as above to have common offset is + // counter-productive as all offsets can be calculated at compile + // time. + // Note: the next four blocks could be in a loop. That would help the + // instruction cache but is actually slower. + { + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); + step1[0] = ADD_EPI16(in00, in31); + step1[1] = ADD_EPI16(in01, in30); + step1[2] = ADD_EPI16(in02, in29); + step1[3] = ADD_EPI16(in03, in28); + step1[28] = SUB_EPI16(in03, in28); + step1[29] = SUB_EPI16(in02, in29); + step1[30] = SUB_EPI16(in01, in30); + step1[31] = SUB_EPI16(in00, in31); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2], + &step1[3], &step1[28], &step1[29], + &step1[30], &step1[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); + __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); + step1[4] = ADD_EPI16(in04, in27); + step1[5] = ADD_EPI16(in05, in26); + step1[6] = ADD_EPI16(in06, in25); + step1[7] = ADD_EPI16(in07, in24); + step1[24] = SUB_EPI16(in07, in24); + step1[25] = SUB_EPI16(in06, in25); + step1[26] = SUB_EPI16(in05, in26); + step1[27] = SUB_EPI16(in04, in27); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6], + &step1[7], &step1[24], &step1[25], + &step1[26], &step1[27]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); + step1[8] = ADD_EPI16(in08, in23); + step1[9] = ADD_EPI16(in09, in22); + step1[10] = ADD_EPI16(in10, in21); + step1[11] = ADD_EPI16(in11, in20); + step1[20] = SUB_EPI16(in11, in20); + step1[21] = SUB_EPI16(in10, in21); + step1[22] = SUB_EPI16(in09, in22); + step1[23] = SUB_EPI16(in08, in23); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10], + &step1[11], &step1[20], &step1[21], + &step1[22], &step1[23]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); + step1[12] = ADD_EPI16(in12, in19); + step1[13] = ADD_EPI16(in13, in18); + step1[14] = ADD_EPI16(in14, in17); + step1[15] = ADD_EPI16(in15, in16); + step1[16] = SUB_EPI16(in15, in16); + step1[17] = SUB_EPI16(in14, in17); + step1[18] = SUB_EPI16(in13, in18); + step1[19] = SUB_EPI16(in12, in19); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14], + &step1[15], &step1[16], &step1[17], + &step1[18], &step1[19]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Stage 2 + { + step2[0] = ADD_EPI16(step1[0], step1[15]); + step2[1] = ADD_EPI16(step1[1], step1[14]); + step2[2] = ADD_EPI16(step1[2], step1[13]); + step2[3] = ADD_EPI16(step1[3], step1[12]); + step2[4] = ADD_EPI16(step1[4], step1[11]); + step2[5] = ADD_EPI16(step1[5], step1[10]); + step2[6] = ADD_EPI16(step1[6], step1[9]); + step2[7] = ADD_EPI16(step1[7], step1[8]); + step2[8] = SUB_EPI16(step1[7], step1[8]); + step2[9] = SUB_EPI16(step1[6], step1[9]); + step2[10] = SUB_EPI16(step1[5], step1[10]); + step2[11] = SUB_EPI16(step1[4], step1[11]); + step2[12] = SUB_EPI16(step1[3], step1[12]); + step2[13] = SUB_EPI16(step1[2], step1[13]); + step2[14] = SUB_EPI16(step1[1], step1[14]); + step2[15] = SUB_EPI16(step1[0], step1[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); + const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); + // Combine + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); + step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22], + &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + +#if !FDCT32x32_HIGH_PRECISION + // dump the magnitude by half, hence the intermediate values are within + // the range of 16 bits. + if (1 == pass) { + __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); + __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); + __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); + __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); + __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); + __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); + __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); + __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); + __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); + __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); + __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); + __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); + __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); + __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); + __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); + __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); + __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); + __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); + __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); + __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); + __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); + __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); + __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); + __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); + __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); + __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); + __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); + __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); + __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); + __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); + + step2[0] = SUB_EPI16(step2[0], s3_00_0); + step2[1] = SUB_EPI16(step2[1], s3_01_0); + step2[2] = SUB_EPI16(step2[2], s3_02_0); + step2[3] = SUB_EPI16(step2[3], s3_03_0); + step2[4] = SUB_EPI16(step2[4], s3_04_0); + step2[5] = SUB_EPI16(step2[5], s3_05_0); + step2[6] = SUB_EPI16(step2[6], s3_06_0); + step2[7] = SUB_EPI16(step2[7], s3_07_0); + step2[8] = SUB_EPI16(step2[8], s2_08_0); + step2[9] = SUB_EPI16(step2[9], s2_09_0); + step2[10] = SUB_EPI16(step2[10], s3_10_0); + step2[11] = SUB_EPI16(step2[11], s3_11_0); + step2[12] = SUB_EPI16(step2[12], s3_12_0); + step2[13] = SUB_EPI16(step2[13], s3_13_0); + step2[14] = SUB_EPI16(step2[14], s2_14_0); + step2[15] = SUB_EPI16(step2[15], s2_15_0); + step1[16] = SUB_EPI16(step1[16], s3_16_0); + step1[17] = SUB_EPI16(step1[17], s3_17_0); + step1[18] = SUB_EPI16(step1[18], s3_18_0); + step1[19] = SUB_EPI16(step1[19], s3_19_0); + step2[20] = SUB_EPI16(step2[20], s3_20_0); + step2[21] = SUB_EPI16(step2[21], s3_21_0); + step2[22] = SUB_EPI16(step2[22], s3_22_0); + step2[23] = SUB_EPI16(step2[23], s3_23_0); + step2[24] = SUB_EPI16(step2[24], s3_24_0); + step2[25] = SUB_EPI16(step2[25], s3_25_0); + step2[26] = SUB_EPI16(step2[26], s3_26_0); + step2[27] = SUB_EPI16(step2[27], s3_27_0); + step1[28] = SUB_EPI16(step1[28], s3_28_0); + step1[29] = SUB_EPI16(step1[29], s3_29_0); + step1[30] = SUB_EPI16(step1[30], s3_30_0); + step1[31] = SUB_EPI16(step1[31], s3_31_0); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x32( + &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], + &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], + &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], + &step1[17], &step1[18], &step1[19], &step2[20], &step2[21], + &step2[22], &step2[23], &step2[24], &step2[25], &step2[26], + &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + step2[0] = _mm_add_epi16(step2[0], kOne); + step2[1] = _mm_add_epi16(step2[1], kOne); + step2[2] = _mm_add_epi16(step2[2], kOne); + step2[3] = _mm_add_epi16(step2[3], kOne); + step2[4] = _mm_add_epi16(step2[4], kOne); + step2[5] = _mm_add_epi16(step2[5], kOne); + step2[6] = _mm_add_epi16(step2[6], kOne); + step2[7] = _mm_add_epi16(step2[7], kOne); + step2[8] = _mm_add_epi16(step2[8], kOne); + step2[9] = _mm_add_epi16(step2[9], kOne); + step2[10] = _mm_add_epi16(step2[10], kOne); + step2[11] = _mm_add_epi16(step2[11], kOne); + step2[12] = _mm_add_epi16(step2[12], kOne); + step2[13] = _mm_add_epi16(step2[13], kOne); + step2[14] = _mm_add_epi16(step2[14], kOne); + step2[15] = _mm_add_epi16(step2[15], kOne); + step1[16] = _mm_add_epi16(step1[16], kOne); + step1[17] = _mm_add_epi16(step1[17], kOne); + step1[18] = _mm_add_epi16(step1[18], kOne); + step1[19] = _mm_add_epi16(step1[19], kOne); + step2[20] = _mm_add_epi16(step2[20], kOne); + step2[21] = _mm_add_epi16(step2[21], kOne); + step2[22] = _mm_add_epi16(step2[22], kOne); + step2[23] = _mm_add_epi16(step2[23], kOne); + step2[24] = _mm_add_epi16(step2[24], kOne); + step2[25] = _mm_add_epi16(step2[25], kOne); + step2[26] = _mm_add_epi16(step2[26], kOne); + step2[27] = _mm_add_epi16(step2[27], kOne); + step1[28] = _mm_add_epi16(step1[28], kOne); + step1[29] = _mm_add_epi16(step1[29], kOne); + step1[30] = _mm_add_epi16(step1[30], kOne); + step1[31] = _mm_add_epi16(step1[31], kOne); + + step2[0] = _mm_srai_epi16(step2[0], 2); + step2[1] = _mm_srai_epi16(step2[1], 2); + step2[2] = _mm_srai_epi16(step2[2], 2); + step2[3] = _mm_srai_epi16(step2[3], 2); + step2[4] = _mm_srai_epi16(step2[4], 2); + step2[5] = _mm_srai_epi16(step2[5], 2); + step2[6] = _mm_srai_epi16(step2[6], 2); + step2[7] = _mm_srai_epi16(step2[7], 2); + step2[8] = _mm_srai_epi16(step2[8], 2); + step2[9] = _mm_srai_epi16(step2[9], 2); + step2[10] = _mm_srai_epi16(step2[10], 2); + step2[11] = _mm_srai_epi16(step2[11], 2); + step2[12] = _mm_srai_epi16(step2[12], 2); + step2[13] = _mm_srai_epi16(step2[13], 2); + step2[14] = _mm_srai_epi16(step2[14], 2); + step2[15] = _mm_srai_epi16(step2[15], 2); + step1[16] = _mm_srai_epi16(step1[16], 2); + step1[17] = _mm_srai_epi16(step1[17], 2); + step1[18] = _mm_srai_epi16(step1[18], 2); + step1[19] = _mm_srai_epi16(step1[19], 2); + step2[20] = _mm_srai_epi16(step2[20], 2); + step2[21] = _mm_srai_epi16(step2[21], 2); + step2[22] = _mm_srai_epi16(step2[22], 2); + step2[23] = _mm_srai_epi16(step2[23], 2); + step2[24] = _mm_srai_epi16(step2[24], 2); + step2[25] = _mm_srai_epi16(step2[25], 2); + step2[26] = _mm_srai_epi16(step2[26], 2); + step2[27] = _mm_srai_epi16(step2[27], 2); + step1[28] = _mm_srai_epi16(step1[28], 2); + step1[29] = _mm_srai_epi16(step1[29], 2); + step1[30] = _mm_srai_epi16(step1[30], 2); + step1[31] = _mm_srai_epi16(step1[31], 2); + } +#endif // !FDCT32x32_HIGH_PRECISION + +#if FDCT32x32_HIGH_PRECISION + if (pass == 0) { +#endif + // Stage 3 + { + step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); + step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); + step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); + step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); + step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); + step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); + step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); + step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], + &step3[3], &step3[4], &step3[5], + &step3[6], &step3[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + // Combine + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], + &step3[13]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step3[16] = ADD_EPI16(step2[23], step1[16]); + step3[17] = ADD_EPI16(step2[22], step1[17]); + step3[18] = ADD_EPI16(step2[21], step1[18]); + step3[19] = ADD_EPI16(step2[20], step1[19]); + step3[20] = SUB_EPI16(step1[19], step2[20]); + step3[21] = SUB_EPI16(step1[18], step2[21]); + step3[22] = SUB_EPI16(step1[17], step2[22]); + step3[23] = SUB_EPI16(step1[16], step2[23]); + step3[24] = SUB_EPI16(step1[31], step2[24]); + step3[25] = SUB_EPI16(step1[30], step2[25]); + step3[26] = SUB_EPI16(step1[29], step2[26]); + step3[27] = SUB_EPI16(step1[28], step2[27]); + step3[28] = ADD_EPI16(step2[27], step1[28]); + step3[29] = ADD_EPI16(step2[26], step1[29]); + step3[30] = ADD_EPI16(step2[25], step1[30]); + step3[31] = ADD_EPI16(step2[24], step1[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step3[16], &step3[17], &step3[18], &step3[19], &step3[20], + &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], + &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], + &step3[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + + // Stage 4 + { + step1[0] = ADD_EPI16(step3[3], step3[0]); + step1[1] = ADD_EPI16(step3[2], step3[1]); + step1[2] = SUB_EPI16(step3[1], step3[2]); + step1[3] = SUB_EPI16(step3[0], step3[3]); + step1[8] = ADD_EPI16(step3[11], step2[8]); + step1[9] = ADD_EPI16(step3[10], step2[9]); + step1[10] = SUB_EPI16(step2[9], step3[10]); + step1[11] = SUB_EPI16(step2[8], step3[11]); + step1[12] = SUB_EPI16(step2[15], step3[12]); + step1[13] = SUB_EPI16(step2[14], step3[13]); + step1[14] = ADD_EPI16(step3[13], step2[14]); + step1[15] = ADD_EPI16(step3[12], step2[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], + &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], + &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); + // Combine + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); + const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); + const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); + // Combine + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); + step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], + &step1[21], &step1[26], &step1[27], + &step1[28], &step1[29]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 5 + { + step2[4] = ADD_EPI16(step1[5], step3[4]); + step2[5] = SUB_EPI16(step3[4], step1[5]); + step2[6] = SUB_EPI16(step3[7], step1[6]); + step2[7] = ADD_EPI16(step1[6], step3[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6], + &step2[7]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i out_00_4 = + _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); + const __m128i out_00_5 = + _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); + const __m128i out_16_4 = + _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); + const __m128i out_16_5 = + _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); + const __m128i out_08_4 = + _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); + const __m128i out_08_5 = + _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); + const __m128i out_24_4 = + _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); + const __m128i out_24_5 = + _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); + // Combine + out[0] = _mm_packs_epi32(out_00_6, out_00_7); + out[16] = _mm_packs_epi32(out_16_6, out_16_7); + out[8] = _mm_packs_epi32(out_08_6, out_08_7); + out[24] = _mm_packs_epi32(out_24_6, out_24_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); + // dct_const_round_shift + const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); + // Combine + step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], + &step2[14]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step2[16] = ADD_EPI16(step1[19], step3[16]); + step2[17] = ADD_EPI16(step1[18], step3[17]); + step2[18] = SUB_EPI16(step3[17], step1[18]); + step2[19] = SUB_EPI16(step3[16], step1[19]); + step2[20] = SUB_EPI16(step3[23], step1[20]); + step2[21] = SUB_EPI16(step3[22], step1[21]); + step2[22] = ADD_EPI16(step1[21], step3[22]); + step2[23] = ADD_EPI16(step1[20], step3[23]); + step2[24] = ADD_EPI16(step1[27], step3[24]); + step2[25] = ADD_EPI16(step1[26], step3[25]); + step2[26] = SUB_EPI16(step3[25], step1[26]); + step2[27] = SUB_EPI16(step3[24], step1[27]); + step2[28] = SUB_EPI16(step3[31], step1[28]); + step2[29] = SUB_EPI16(step3[30], step1[29]); + step2[30] = ADD_EPI16(step1[29], step3[30]); + step2[31] = ADD_EPI16(step1[28], step3[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], + &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], + &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], + &step2[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 6 + { + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); + // dct_const_round_shift + const __m128i out_04_4 = + _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); + const __m128i out_04_5 = + _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); + const __m128i out_20_4 = + _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); + const __m128i out_20_5 = + _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); + const __m128i out_12_4 = + _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); + const __m128i out_12_5 = + _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); + const __m128i out_28_4 = + _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); + const __m128i out_28_5 = + _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); + // Combine + out[4] = _mm_packs_epi32(out_04_6, out_04_7); + out[20] = _mm_packs_epi32(out_20_6, out_20_7); + out[12] = _mm_packs_epi32(out_12_6, out_12_7); + out[28] = _mm_packs_epi32(out_28_6, out_28_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step3[8] = ADD_EPI16(step2[9], step1[8]); + step3[9] = SUB_EPI16(step1[8], step2[9]); + step3[10] = SUB_EPI16(step1[11], step2[10]); + step3[11] = ADD_EPI16(step2[10], step1[11]); + step3[12] = ADD_EPI16(step2[13], step1[12]); + step3[13] = SUB_EPI16(step1[12], step2[13]); + step3[14] = SUB_EPI16(step1[15], step2[14]); + step3[15] = ADD_EPI16(step2[14], step1[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], + &step3[11], &step3[12], &step3[13], + &step3[14], &step3[15]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); + // dct_const_round_shift + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); + // Combine + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); + step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); + // Combine + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], + &step3[22], &step3[25], &step3[26], + &step3[29], &step3[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Stage 7 + { + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); + const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); + // dct_const_round_shift + const __m128i out_02_4 = + _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); + const __m128i out_02_5 = + _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); + const __m128i out_18_4 = + _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); + const __m128i out_18_5 = + _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); + const __m128i out_10_4 = + _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); + const __m128i out_10_5 = + _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); + const __m128i out_26_4 = + _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); + const __m128i out_26_5 = + _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); + const __m128i out_06_4 = + _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); + const __m128i out_06_5 = + _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); + const __m128i out_22_4 = + _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); + const __m128i out_22_5 = + _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); + const __m128i out_14_4 = + _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); + const __m128i out_14_5 = + _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); + const __m128i out_30_4 = + _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); + const __m128i out_30_5 = + _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); + const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); + // Combine + out[2] = _mm_packs_epi32(out_02_6, out_02_7); + out[18] = _mm_packs_epi32(out_18_6, out_18_7); + out[10] = _mm_packs_epi32(out_10_6, out_10_7); + out[26] = _mm_packs_epi32(out_26_6, out_26_7); + out[6] = _mm_packs_epi32(out_06_6, out_06_7); + out[22] = _mm_packs_epi32(out_22_6, out_22_7); + out[14] = _mm_packs_epi32(out_14_6, out_14_7); + out[30] = _mm_packs_epi32(out_30_6, out_30_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + step1[16] = ADD_EPI16(step3[17], step2[16]); + step1[17] = SUB_EPI16(step2[16], step3[17]); + step1[18] = SUB_EPI16(step2[19], step3[18]); + step1[19] = ADD_EPI16(step3[18], step2[19]); + step1[20] = ADD_EPI16(step3[21], step2[20]); + step1[21] = SUB_EPI16(step2[20], step3[21]); + step1[22] = SUB_EPI16(step2[23], step3[22]); + step1[23] = ADD_EPI16(step3[22], step2[23]); + step1[24] = ADD_EPI16(step3[25], step2[24]); + step1[25] = SUB_EPI16(step2[24], step3[25]); + step1[26] = SUB_EPI16(step2[27], step3[26]); + step1[27] = ADD_EPI16(step3[26], step2[27]); + step1[28] = ADD_EPI16(step3[29], step2[28]); + step1[29] = SUB_EPI16(step2[28], step3[29]); + step1[30] = SUB_EPI16(step2[31], step3[30]); + step1[31] = ADD_EPI16(step3[30], step2[31]); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x16( + &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], + &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], + &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], + &step1[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Final stage --- outputs indices are bit-reversed. + { + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); + // dct_const_round_shift + const __m128i out_01_4 = + _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); + const __m128i out_01_5 = + _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); + const __m128i out_17_4 = + _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); + const __m128i out_17_5 = + _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); + const __m128i out_09_4 = + _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); + const __m128i out_09_5 = + _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); + const __m128i out_25_4 = + _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); + const __m128i out_25_5 = + _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); + const __m128i out_07_4 = + _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); + const __m128i out_07_5 = + _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); + const __m128i out_23_4 = + _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); + const __m128i out_23_5 = + _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); + const __m128i out_15_4 = + _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); + const __m128i out_15_5 = + _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); + const __m128i out_31_4 = + _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); + const __m128i out_31_5 = + _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); + const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); + // Combine + out[1] = _mm_packs_epi32(out_01_6, out_01_7); + out[17] = _mm_packs_epi32(out_17_6, out_17_7); + out[9] = _mm_packs_epi32(out_09_6, out_09_7); + out[25] = _mm_packs_epi32(out_25_6, out_25_7); + out[7] = _mm_packs_epi32(out_07_6, out_07_7); + out[23] = _mm_packs_epi32(out_23_6, out_23_7); + out[15] = _mm_packs_epi32(out_15_6, out_15_7); + out[31] = _mm_packs_epi32(out_31_6, out_31_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); + const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); + // dct_const_round_shift + const __m128i out_05_4 = + _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); + const __m128i out_05_5 = + _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); + const __m128i out_21_4 = + _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); + const __m128i out_21_5 = + _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); + const __m128i out_13_4 = + _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); + const __m128i out_13_5 = + _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); + const __m128i out_29_4 = + _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); + const __m128i out_29_5 = + _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); + const __m128i out_03_4 = + _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); + const __m128i out_03_5 = + _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); + const __m128i out_19_4 = + _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); + const __m128i out_19_5 = + _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); + const __m128i out_11_4 = + _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); + const __m128i out_11_5 = + _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); + const __m128i out_27_4 = + _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); + const __m128i out_27_5 = + _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); + // Combine + out[5] = _mm_packs_epi32(out_05_6, out_05_7); + out[21] = _mm_packs_epi32(out_21_6, out_21_7); + out[13] = _mm_packs_epi32(out_13_6, out_13_7); + out[29] = _mm_packs_epi32(out_29_6, out_29_7); + out[3] = _mm_packs_epi32(out_03_6, out_03_7); + out[19] = _mm_packs_epi32(out_19_6, out_19_7); + out[11] = _mm_packs_epi32(out_11_6, out_11_7); + out[27] = _mm_packs_epi32(out_27_6, out_27_7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + if (pass == 0) + HIGH_FDCT32x32_2D_C(input, output_org, stride); + else + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } +#if FDCT32x32_HIGH_PRECISION + } else { + __m128i lstep1[64], lstep2[64], lstep3[64]; + __m128i u[32], v[32], sign[16]; + const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); + // start using 32-bit operations + // stage 3 + { + // expanding to 32-bit length priori to addition operations + lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); + lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); + lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); + lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); + lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); + lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); + lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); + lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); + lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero); + lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero); + lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero); + lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero); + lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero); + lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero); + lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero); + lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero); + lstep2[0] = _mm_madd_epi16(lstep2[0], kOne); + lstep2[1] = _mm_madd_epi16(lstep2[1], kOne); + lstep2[2] = _mm_madd_epi16(lstep2[2], kOne); + lstep2[3] = _mm_madd_epi16(lstep2[3], kOne); + lstep2[4] = _mm_madd_epi16(lstep2[4], kOne); + lstep2[5] = _mm_madd_epi16(lstep2[5], kOne); + lstep2[6] = _mm_madd_epi16(lstep2[6], kOne); + lstep2[7] = _mm_madd_epi16(lstep2[7], kOne); + lstep2[8] = _mm_madd_epi16(lstep2[8], kOne); + lstep2[9] = _mm_madd_epi16(lstep2[9], kOne); + lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); + lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); + lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); + lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); + lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); + lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); + + lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); + lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); + lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); + lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); + lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); + lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); + lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); + lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); + lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]); + lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); + lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); + lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); + lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); + lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); + lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); + lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); + } + { + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); + lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); + lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); + lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); + lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); + lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); + lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); + lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); + lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); + } + { + lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); + lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); + lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); + lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); + lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); + lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); + lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); + lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); + lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); + lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); + lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); + lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); + lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); + lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); + lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); + lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); + lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); + lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); + lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); + lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); + lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); + lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); + lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); + lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); + lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); + lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); + lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); + lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); + lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); + lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); + lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); + lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); + + lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); + lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); + lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); + lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); + lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); + lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); + lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); + lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); + lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); + lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); + lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); + lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); + lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); + lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); + lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); + lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); + lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); + lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); + lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); + lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); + lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); + lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); + lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); + lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); + lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); + lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); + lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); + lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); + lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); + lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); + lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); + lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); + + lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); + lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); + + lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); + lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); + lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); + lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); + lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); + lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); + lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); + lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); + lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); + lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); + lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); + lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); + lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); + lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); + lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); + lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); + lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); + lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); + lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); + lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); + lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); + lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); + lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); + lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); + lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); + lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); + lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); + lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); + lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); + lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); + } + + // stage 4 + { + // expanding to 32-bit length priori to addition operations + lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); + lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); + lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); + lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); + lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); + lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); + lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); + lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); + lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); + lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); + lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); + lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); + lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); + lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); + lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); + lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); + + lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); + lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); + lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); + lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); + lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]); + lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); + lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); + lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); + lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); + lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); + lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); + lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); + lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); + lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); + lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); + lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); + lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); + lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); + lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); + lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); + lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); + lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); + lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); + lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); + } + { + // to be continued... + // + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[0] = k_madd_epi32(u[0], k32_p16_m16); + v[1] = k_madd_epi32(u[1], k32_p16_m16); + v[2] = k_madd_epi32(u[2], k32_p16_m16); + v[3] = k_madd_epi32(u[3], k32_p16_m16); + v[4] = k_madd_epi32(u[0], k32_p16_p16); + v[5] = k_madd_epi32(u[1], k32_p16_p16); + v[6] = k_madd_epi32(u[2], k32_p16_p16); + v[7] = k_madd_epi32(u[3], k32_p16_p16); +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], + &v[5], &v[6], &v[7], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); + u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); + u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); + u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); + u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); + u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); + u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); + u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); + u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); + u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); + u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); + u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); + u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); + u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); + u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); + u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m08_p24); + v[5] = k_madd_epi32(u[5], k32_m08_p24); + v[6] = k_madd_epi32(u[6], k32_m08_p24); + v[7] = k_madd_epi32(u[7], k32_m08_p24); + v[8] = k_madd_epi32(u[8], k32_m24_m08); + v[9] = k_madd_epi32(u[9], k32_m24_m08); + v[10] = k_madd_epi32(u[10], k32_m24_m08); + v[11] = k_madd_epi32(u[11], k32_m24_m08); + v[12] = k_madd_epi32(u[12], k32_m24_m08); + v[13] = k_madd_epi32(u[13], k32_m24_m08); + v[14] = k_madd_epi32(u[14], k32_m24_m08); + v[15] = k_madd_epi32(u[15], k32_m24_m08); + v[16] = k_madd_epi32(u[12], k32_m08_p24); + v[17] = k_madd_epi32(u[13], k32_m08_p24); + v[18] = k_madd_epi32(u[14], k32_m08_p24); + v[19] = k_madd_epi32(u[15], k32_m08_p24); + v[20] = k_madd_epi32(u[8], k32_m08_p24); + v[21] = k_madd_epi32(u[9], k32_m08_p24); + v[22] = k_madd_epi32(u[10], k32_m08_p24); + v[23] = k_madd_epi32(u[11], k32_m08_p24); + v[24] = k_madd_epi32(u[4], k32_p24_p08); + v[25] = k_madd_epi32(u[5], k32_p24_p08); + v[26] = k_madd_epi32(u[6], k32_p24_p08); + v[27] = k_madd_epi32(u[7], k32_p24_p08); + v[28] = k_madd_epi32(u[0], k32_p24_p08); + v[29] = k_madd_epi32(u[1], k32_p24_p08); + v[30] = k_madd_epi32(u[2], k32_p24_p08); + v[31] = k_madd_epi32(u[3], k32_p24_p08); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 5 + { + lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]); + lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]); + lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]); + lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]); + lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); + lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); + lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); + lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); + } + { + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + + u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); + u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); + u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); + u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); + u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); + u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); + u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); + u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); + + // TODO(jingning): manually inline k_madd_epi32_ to further hide + // instruction latency. + v[0] = k_madd_epi32(u[0], k32_p16_p16); + v[1] = k_madd_epi32(u[1], k32_p16_p16); + v[2] = k_madd_epi32(u[2], k32_p16_p16); + v[3] = k_madd_epi32(u[3], k32_p16_p16); + v[4] = k_madd_epi32(u[0], k32_p16_m16); + v[5] = k_madd_epi32(u[1], k32_p16_m16); + v[6] = k_madd_epi32(u[2], k32_p16_m16); + v[7] = k_madd_epi32(u[3], k32_p16_m16); + v[8] = k_madd_epi32(u[4], k32_p24_p08); + v[9] = k_madd_epi32(u[5], k32_p24_p08); + v[10] = k_madd_epi32(u[6], k32_p24_p08); + v[11] = k_madd_epi32(u[7], k32_p24_p08); + v[12] = k_madd_epi32(u[4], k32_m08_p24); + v[13] = k_madd_epi32(u[5], k32_m08_p24); + v[14] = k_madd_epi32(u[6], k32_m08_p24); + v[15] = k_madd_epi32(u[7], k32_m08_p24); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + // Combine + out[0] = _mm_packs_epi32(u[0], u[1]); + out[16] = _mm_packs_epi32(u[2], u[3]); + out[8] = _mm_packs_epi32(u[4], u[5]); + out[24] = _mm_packs_epi32(u[6], u[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); + + u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); + u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); + u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); + u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); + u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); + u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); + u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); + u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); + + v[0] = k_madd_epi32(u[0], k32_m08_p24); + v[1] = k_madd_epi32(u[1], k32_m08_p24); + v[2] = k_madd_epi32(u[2], k32_m08_p24); + v[3] = k_madd_epi32(u[3], k32_m08_p24); + v[4] = k_madd_epi32(u[4], k32_m24_m08); + v[5] = k_madd_epi32(u[5], k32_m24_m08); + v[6] = k_madd_epi32(u[6], k32_m24_m08); + v[7] = k_madd_epi32(u[7], k32_m24_m08); + v[8] = k_madd_epi32(u[4], k32_m08_p24); + v[9] = k_madd_epi32(u[5], k32_m08_p24); + v[10] = k_madd_epi32(u[6], k32_m08_p24); + v[11] = k_madd_epi32(u[7], k32_m08_p24); + v[12] = k_madd_epi32(u[0], k32_p24_p08); + v[13] = k_madd_epi32(u[1], k32_p24_p08); + v[14] = k_madd_epi32(u[2], k32_p24_p08); + v[15] = k_madd_epi32(u[3], k32_p24_p08); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + } + { + lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); + lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); + lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); + lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); + lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); + lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); + lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); + lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); + lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); + lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); + lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); + lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); + lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); + lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); + lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); + lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); + lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); + lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); + lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); + lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); + lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); + lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); + lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); + lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); + lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); + lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); + lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); + lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); + lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); + lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); + lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); + lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); + } + // stage 6 + { + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + + u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); + u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); + u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); + u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); + u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); + u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); + u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); + u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); + u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); + + v[0] = k_madd_epi32(u[0], k32_p28_p04); + v[1] = k_madd_epi32(u[1], k32_p28_p04); + v[2] = k_madd_epi32(u[2], k32_p28_p04); + v[3] = k_madd_epi32(u[3], k32_p28_p04); + v[4] = k_madd_epi32(u[4], k32_p12_p20); + v[5] = k_madd_epi32(u[5], k32_p12_p20); + v[6] = k_madd_epi32(u[6], k32_p12_p20); + v[7] = k_madd_epi32(u[7], k32_p12_p20); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m04_p28); + v[13] = k_madd_epi32(u[13], k32_m04_p28); + v[14] = k_madd_epi32(u[14], k32_m04_p28); + v[15] = k_madd_epi32(u[15], k32_m04_p28); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_16( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + + sign[0] = _mm_cmplt_epi32(u[0], kZero); + sign[1] = _mm_cmplt_epi32(u[1], kZero); + sign[2] = _mm_cmplt_epi32(u[2], kZero); + sign[3] = _mm_cmplt_epi32(u[3], kZero); + sign[4] = _mm_cmplt_epi32(u[4], kZero); + sign[5] = _mm_cmplt_epi32(u[5], kZero); + sign[6] = _mm_cmplt_epi32(u[6], kZero); + sign[7] = _mm_cmplt_epi32(u[7], kZero); + + u[0] = _mm_sub_epi32(u[0], sign[0]); + u[1] = _mm_sub_epi32(u[1], sign[1]); + u[2] = _mm_sub_epi32(u[2], sign[2]); + u[3] = _mm_sub_epi32(u[3], sign[3]); + u[4] = _mm_sub_epi32(u[4], sign[4]); + u[5] = _mm_sub_epi32(u[5], sign[5]); + u[6] = _mm_sub_epi32(u[6], sign[6]); + u[7] = _mm_sub_epi32(u[7], sign[7]); + + u[0] = _mm_add_epi32(u[0], K32One); + u[1] = _mm_add_epi32(u[1], K32One); + u[2] = _mm_add_epi32(u[2], K32One); + u[3] = _mm_add_epi32(u[3], K32One); + u[4] = _mm_add_epi32(u[4], K32One); + u[5] = _mm_add_epi32(u[5], K32One); + u[6] = _mm_add_epi32(u[6], K32One); + u[7] = _mm_add_epi32(u[7], K32One); + + u[0] = _mm_srai_epi32(u[0], 2); + u[1] = _mm_srai_epi32(u[1], 2); + u[2] = _mm_srai_epi32(u[2], 2); + u[3] = _mm_srai_epi32(u[3], 2); + u[4] = _mm_srai_epi32(u[4], 2); + u[5] = _mm_srai_epi32(u[5], 2); + u[6] = _mm_srai_epi32(u[6], 2); + u[7] = _mm_srai_epi32(u[7], 2); + + out[4] = _mm_packs_epi32(u[0], u[1]); + out[20] = _mm_packs_epi32(u[2], u[3]); + out[12] = _mm_packs_epi32(u[4], u[5]); + out[28] = _mm_packs_epi32(u[6], u[7]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); + lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); + lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); + lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); + lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); + lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); + lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); + lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); + lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); + lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); + lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); + lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); + lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); + lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); + lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); + lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); + } + { + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); + const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); + const __m128i k32_m12_m20 = + pair_set_epi32(-cospi_12_64, -cospi_20_64); + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); + + u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); + u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); + u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); + u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); + u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); + u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); + u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); + u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); + u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); + u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); + u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); + u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); + u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); + u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); + u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); + u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); + + v[0] = k_madd_epi32(u[0], k32_m04_p28); + v[1] = k_madd_epi32(u[1], k32_m04_p28); + v[2] = k_madd_epi32(u[2], k32_m04_p28); + v[3] = k_madd_epi32(u[3], k32_m04_p28); + v[4] = k_madd_epi32(u[4], k32_m28_m04); + v[5] = k_madd_epi32(u[5], k32_m28_m04); + v[6] = k_madd_epi32(u[6], k32_m28_m04); + v[7] = k_madd_epi32(u[7], k32_m28_m04); + v[8] = k_madd_epi32(u[8], k32_m20_p12); + v[9] = k_madd_epi32(u[9], k32_m20_p12); + v[10] = k_madd_epi32(u[10], k32_m20_p12); + v[11] = k_madd_epi32(u[11], k32_m20_p12); + v[12] = k_madd_epi32(u[12], k32_m12_m20); + v[13] = k_madd_epi32(u[13], k32_m12_m20); + v[14] = k_madd_epi32(u[14], k32_m12_m20); + v[15] = k_madd_epi32(u[15], k32_m12_m20); + v[16] = k_madd_epi32(u[12], k32_m20_p12); + v[17] = k_madd_epi32(u[13], k32_m20_p12); + v[18] = k_madd_epi32(u[14], k32_m20_p12); + v[19] = k_madd_epi32(u[15], k32_m20_p12); + v[20] = k_madd_epi32(u[8], k32_p12_p20); + v[21] = k_madd_epi32(u[9], k32_p12_p20); + v[22] = k_madd_epi32(u[10], k32_p12_p20); + v[23] = k_madd_epi32(u[11], k32_p12_p20); + v[24] = k_madd_epi32(u[4], k32_m04_p28); + v[25] = k_madd_epi32(u[5], k32_m04_p28); + v[26] = k_madd_epi32(u[6], k32_m04_p28); + v[27] = k_madd_epi32(u[7], k32_m04_p28); + v[28] = k_madd_epi32(u[0], k32_p28_p04); + v[29] = k_madd_epi32(u[1], k32_p28_p04); + v[30] = k_madd_epi32(u[2], k32_p28_p04); + v[31] = k_madd_epi32(u[3], k32_p28_p04); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + } + // stage 7 + { + const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); + const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); + const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); + const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); + const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); + const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); + const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); + + u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); + u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); + u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); + u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); + u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); + u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); + u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); + u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); + u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); + u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); + u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); + u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); + u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); + u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); + u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); + u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); + + v[0] = k_madd_epi32(u[0], k32_p30_p02); + v[1] = k_madd_epi32(u[1], k32_p30_p02); + v[2] = k_madd_epi32(u[2], k32_p30_p02); + v[3] = k_madd_epi32(u[3], k32_p30_p02); + v[4] = k_madd_epi32(u[4], k32_p14_p18); + v[5] = k_madd_epi32(u[5], k32_p14_p18); + v[6] = k_madd_epi32(u[6], k32_p14_p18); + v[7] = k_madd_epi32(u[7], k32_p14_p18); + v[8] = k_madd_epi32(u[8], k32_p22_p10); + v[9] = k_madd_epi32(u[9], k32_p22_p10); + v[10] = k_madd_epi32(u[10], k32_p22_p10); + v[11] = k_madd_epi32(u[11], k32_p22_p10); + v[12] = k_madd_epi32(u[12], k32_p06_p26); + v[13] = k_madd_epi32(u[13], k32_p06_p26); + v[14] = k_madd_epi32(u[14], k32_p06_p26); + v[15] = k_madd_epi32(u[15], k32_p06_p26); + v[16] = k_madd_epi32(u[12], k32_m26_p06); + v[17] = k_madd_epi32(u[13], k32_m26_p06); + v[18] = k_madd_epi32(u[14], k32_m26_p06); + v[19] = k_madd_epi32(u[15], k32_m26_p06); + v[20] = k_madd_epi32(u[8], k32_m10_p22); + v[21] = k_madd_epi32(u[9], k32_m10_p22); + v[22] = k_madd_epi32(u[10], k32_m10_p22); + v[23] = k_madd_epi32(u[11], k32_m10_p22); + v[24] = k_madd_epi32(u[4], k32_m18_p14); + v[25] = k_madd_epi32(u[5], k32_m18_p14); + v[26] = k_madd_epi32(u[6], k32_m18_p14); + v[27] = k_madd_epi32(u[7], k32_m18_p14); + v[28] = k_madd_epi32(u[0], k32_m02_p30); + v[29] = k_madd_epi32(u[1], k32_m02_p30); + v[30] = k_madd_epi32(u[2], k32_m02_p30); + v[31] = k_madd_epi32(u[3], k32_m02_p30); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[2] = _mm_packs_epi32(u[0], u[1]); + out[18] = _mm_packs_epi32(u[2], u[3]); + out[10] = _mm_packs_epi32(u[4], u[5]); + out[26] = _mm_packs_epi32(u[6], u[7]); + out[6] = _mm_packs_epi32(u[8], u[9]); + out[22] = _mm_packs_epi32(u[10], u[11]); + out[14] = _mm_packs_epi32(u[12], u[13]); + out[30] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], + &out[6], &out[22], &out[14], &out[30]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); + lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); + lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); + lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); + lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); + lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); + lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); + lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); + lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); + lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); + lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); + lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); + lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); + lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); + lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); + lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); + lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); + lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); + lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); + lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); + lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); + lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); + lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); + lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); + lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); + lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); + lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); + lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); + lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); + lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); + lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); + lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); + } + // stage 8 + { + const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); + const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); + const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); + const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); + const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); + const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); + const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); + const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); + + u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); + u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); + u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); + u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); + u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); + u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); + u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); + u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); + u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); + u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); + u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); + u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); + u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); + u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); + u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); + u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); + + v[0] = k_madd_epi32(u[0], k32_p31_p01); + v[1] = k_madd_epi32(u[1], k32_p31_p01); + v[2] = k_madd_epi32(u[2], k32_p31_p01); + v[3] = k_madd_epi32(u[3], k32_p31_p01); + v[4] = k_madd_epi32(u[4], k32_p15_p17); + v[5] = k_madd_epi32(u[5], k32_p15_p17); + v[6] = k_madd_epi32(u[6], k32_p15_p17); + v[7] = k_madd_epi32(u[7], k32_p15_p17); + v[8] = k_madd_epi32(u[8], k32_p23_p09); + v[9] = k_madd_epi32(u[9], k32_p23_p09); + v[10] = k_madd_epi32(u[10], k32_p23_p09); + v[11] = k_madd_epi32(u[11], k32_p23_p09); + v[12] = k_madd_epi32(u[12], k32_p07_p25); + v[13] = k_madd_epi32(u[13], k32_p07_p25); + v[14] = k_madd_epi32(u[14], k32_p07_p25); + v[15] = k_madd_epi32(u[15], k32_p07_p25); + v[16] = k_madd_epi32(u[12], k32_m25_p07); + v[17] = k_madd_epi32(u[13], k32_m25_p07); + v[18] = k_madd_epi32(u[14], k32_m25_p07); + v[19] = k_madd_epi32(u[15], k32_m25_p07); + v[20] = k_madd_epi32(u[8], k32_m09_p23); + v[21] = k_madd_epi32(u[9], k32_m09_p23); + v[22] = k_madd_epi32(u[10], k32_m09_p23); + v[23] = k_madd_epi32(u[11], k32_m09_p23); + v[24] = k_madd_epi32(u[4], k32_m17_p15); + v[25] = k_madd_epi32(u[5], k32_m17_p15); + v[26] = k_madd_epi32(u[6], k32_m17_p15); + v[27] = k_madd_epi32(u[7], k32_m17_p15); + v[28] = k_madd_epi32(u[0], k32_m01_p31); + v[29] = k_madd_epi32(u[1], k32_m01_p31); + v[30] = k_madd_epi32(u[2], k32_m01_p31); + v[31] = k_madd_epi32(u[3], k32_m01_p31); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[1] = _mm_packs_epi32(u[0], u[1]); + out[17] = _mm_packs_epi32(u[2], u[3]); + out[9] = _mm_packs_epi32(u[4], u[5]); + out[25] = _mm_packs_epi32(u[6], u[7]); + out[7] = _mm_packs_epi32(u[8], u[9]); + out[23] = _mm_packs_epi32(u[10], u[11]); + out[15] = _mm_packs_epi32(u[12], u[13]); + out[31] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], + &out[7], &out[23], &out[15], &out[31]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); + const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); + const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); + const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); + const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); + const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); + const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); + const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); + + u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); + u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); + u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); + u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); + u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); + u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); + u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); + u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); + u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); + u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); + u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); + u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); + u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); + u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); + u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); + u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); + + v[0] = k_madd_epi32(u[0], k32_p27_p05); + v[1] = k_madd_epi32(u[1], k32_p27_p05); + v[2] = k_madd_epi32(u[2], k32_p27_p05); + v[3] = k_madd_epi32(u[3], k32_p27_p05); + v[4] = k_madd_epi32(u[4], k32_p11_p21); + v[5] = k_madd_epi32(u[5], k32_p11_p21); + v[6] = k_madd_epi32(u[6], k32_p11_p21); + v[7] = k_madd_epi32(u[7], k32_p11_p21); + v[8] = k_madd_epi32(u[8], k32_p19_p13); + v[9] = k_madd_epi32(u[9], k32_p19_p13); + v[10] = k_madd_epi32(u[10], k32_p19_p13); + v[11] = k_madd_epi32(u[11], k32_p19_p13); + v[12] = k_madd_epi32(u[12], k32_p03_p29); + v[13] = k_madd_epi32(u[13], k32_p03_p29); + v[14] = k_madd_epi32(u[14], k32_p03_p29); + v[15] = k_madd_epi32(u[15], k32_p03_p29); + v[16] = k_madd_epi32(u[12], k32_m29_p03); + v[17] = k_madd_epi32(u[13], k32_m29_p03); + v[18] = k_madd_epi32(u[14], k32_m29_p03); + v[19] = k_madd_epi32(u[15], k32_m29_p03); + v[20] = k_madd_epi32(u[8], k32_m13_p19); + v[21] = k_madd_epi32(u[9], k32_m13_p19); + v[22] = k_madd_epi32(u[10], k32_m13_p19); + v[23] = k_madd_epi32(u[11], k32_m13_p19); + v[24] = k_madd_epi32(u[4], k32_m21_p11); + v[25] = k_madd_epi32(u[5], k32_m21_p11); + v[26] = k_madd_epi32(u[6], k32_m21_p11); + v[27] = k_madd_epi32(u[7], k32_m21_p11); + v[28] = k_madd_epi32(u[0], k32_m05_p27); + v[29] = k_madd_epi32(u[1], k32_m05_p27); + v[30] = k_madd_epi32(u[2], k32_m05_p27); + v[31] = k_madd_epi32(u[3], k32_m05_p27); + +#if DCT_HIGH_BIT_DEPTH + overflow = k_check_epi32_overflow_32( + &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], + &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], + &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], + &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + u[0] = k_packs_epi64(v[0], v[1]); + u[1] = k_packs_epi64(v[2], v[3]); + u[2] = k_packs_epi64(v[4], v[5]); + u[3] = k_packs_epi64(v[6], v[7]); + u[4] = k_packs_epi64(v[8], v[9]); + u[5] = k_packs_epi64(v[10], v[11]); + u[6] = k_packs_epi64(v[12], v[13]); + u[7] = k_packs_epi64(v[14], v[15]); + u[8] = k_packs_epi64(v[16], v[17]); + u[9] = k_packs_epi64(v[18], v[19]); + u[10] = k_packs_epi64(v[20], v[21]); + u[11] = k_packs_epi64(v[22], v[23]); + u[12] = k_packs_epi64(v[24], v[25]); + u[13] = k_packs_epi64(v[26], v[27]); + u[14] = k_packs_epi64(v[28], v[29]); + u[15] = k_packs_epi64(v[30], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + v[0] = _mm_cmplt_epi32(u[0], kZero); + v[1] = _mm_cmplt_epi32(u[1], kZero); + v[2] = _mm_cmplt_epi32(u[2], kZero); + v[3] = _mm_cmplt_epi32(u[3], kZero); + v[4] = _mm_cmplt_epi32(u[4], kZero); + v[5] = _mm_cmplt_epi32(u[5], kZero); + v[6] = _mm_cmplt_epi32(u[6], kZero); + v[7] = _mm_cmplt_epi32(u[7], kZero); + v[8] = _mm_cmplt_epi32(u[8], kZero); + v[9] = _mm_cmplt_epi32(u[9], kZero); + v[10] = _mm_cmplt_epi32(u[10], kZero); + v[11] = _mm_cmplt_epi32(u[11], kZero); + v[12] = _mm_cmplt_epi32(u[12], kZero); + v[13] = _mm_cmplt_epi32(u[13], kZero); + v[14] = _mm_cmplt_epi32(u[14], kZero); + v[15] = _mm_cmplt_epi32(u[15], kZero); + + u[0] = _mm_sub_epi32(u[0], v[0]); + u[1] = _mm_sub_epi32(u[1], v[1]); + u[2] = _mm_sub_epi32(u[2], v[2]); + u[3] = _mm_sub_epi32(u[3], v[3]); + u[4] = _mm_sub_epi32(u[4], v[4]); + u[5] = _mm_sub_epi32(u[5], v[5]); + u[6] = _mm_sub_epi32(u[6], v[6]); + u[7] = _mm_sub_epi32(u[7], v[7]); + u[8] = _mm_sub_epi32(u[8], v[8]); + u[9] = _mm_sub_epi32(u[9], v[9]); + u[10] = _mm_sub_epi32(u[10], v[10]); + u[11] = _mm_sub_epi32(u[11], v[11]); + u[12] = _mm_sub_epi32(u[12], v[12]); + u[13] = _mm_sub_epi32(u[13], v[13]); + u[14] = _mm_sub_epi32(u[14], v[14]); + u[15] = _mm_sub_epi32(u[15], v[15]); + + v[0] = _mm_add_epi32(u[0], K32One); + v[1] = _mm_add_epi32(u[1], K32One); + v[2] = _mm_add_epi32(u[2], K32One); + v[3] = _mm_add_epi32(u[3], K32One); + v[4] = _mm_add_epi32(u[4], K32One); + v[5] = _mm_add_epi32(u[5], K32One); + v[6] = _mm_add_epi32(u[6], K32One); + v[7] = _mm_add_epi32(u[7], K32One); + v[8] = _mm_add_epi32(u[8], K32One); + v[9] = _mm_add_epi32(u[9], K32One); + v[10] = _mm_add_epi32(u[10], K32One); + v[11] = _mm_add_epi32(u[11], K32One); + v[12] = _mm_add_epi32(u[12], K32One); + v[13] = _mm_add_epi32(u[13], K32One); + v[14] = _mm_add_epi32(u[14], K32One); + v[15] = _mm_add_epi32(u[15], K32One); + + u[0] = _mm_srai_epi32(v[0], 2); + u[1] = _mm_srai_epi32(v[1], 2); + u[2] = _mm_srai_epi32(v[2], 2); + u[3] = _mm_srai_epi32(v[3], 2); + u[4] = _mm_srai_epi32(v[4], 2); + u[5] = _mm_srai_epi32(v[5], 2); + u[6] = _mm_srai_epi32(v[6], 2); + u[7] = _mm_srai_epi32(v[7], 2); + u[8] = _mm_srai_epi32(v[8], 2); + u[9] = _mm_srai_epi32(v[9], 2); + u[10] = _mm_srai_epi32(v[10], 2); + u[11] = _mm_srai_epi32(v[11], 2); + u[12] = _mm_srai_epi32(v[12], 2); + u[13] = _mm_srai_epi32(v[13], 2); + u[14] = _mm_srai_epi32(v[14], 2); + u[15] = _mm_srai_epi32(v[15], 2); + + out[5] = _mm_packs_epi32(u[0], u[1]); + out[21] = _mm_packs_epi32(u[2], u[3]); + out[13] = _mm_packs_epi32(u[4], u[5]); + out[29] = _mm_packs_epi32(u[6], u[7]); + out[3] = _mm_packs_epi32(u[8], u[9]); + out[19] = _mm_packs_epi32(u[10], u[11]); + out[11] = _mm_packs_epi32(u[12], u[13]); + out[27] = _mm_packs_epi32(u[14], u[15]); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], + &out[3], &out[19], &out[11], &out[27]); + if (overflow) { + HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } +#endif // FDCT32x32_HIGH_PRECISION + // Transpose the results, do it as four 8x8 transposes. + { + int transpose_block; + int16_t *output0 = &intermediate[column_start * 32]; + tran_low_t *output1 = &output_org[column_start * 32]; + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { + __m128i *this_out = &out[8 * transpose_block]; + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (0 == pass) { + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; + // TODO(cd): see quality impact of only doing + // output[j] = (output[j] + 1) >> 2; + // which would remove the code between here ... + __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); + // ... and here. + // PS: also change code in av1/encoder/av1_dct.c + tr2_0 = _mm_add_epi16(tr2_0, kOne); + tr2_1 = _mm_add_epi16(tr2_1, kOne); + tr2_2 = _mm_add_epi16(tr2_2, kOne); + tr2_3 = _mm_add_epi16(tr2_3, kOne); + tr2_4 = _mm_add_epi16(tr2_4, kOne); + tr2_5 = _mm_add_epi16(tr2_5, kOne); + tr2_6 = _mm_add_epi16(tr2_6, kOne); + tr2_7 = _mm_add_epi16(tr2_7, kOne); + tr2_0 = _mm_srai_epi16(tr2_0, 2); + tr2_1 = _mm_srai_epi16(tr2_1, 2); + tr2_2 = _mm_srai_epi16(tr2_2, 2); + tr2_3 = _mm_srai_epi16(tr2_3, 2); + tr2_4 = _mm_srai_epi16(tr2_4, 2); + tr2_5 = _mm_srai_epi16(tr2_5, 2); + tr2_6 = _mm_srai_epi16(tr2_6, 2); + tr2_7 = _mm_srai_epi16(tr2_7, 2); + } + // Note: even though all these stores are aligned, using the aligned + // intrinsic make the code slightly slower. + if (pass == 0) { + _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0); + _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1); + _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2); + _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3); + _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4); + _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5); + _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6); + _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7); + // Process next 8x8 + output0 += 8; + } else { + storeu_output(&tr2_0, (output1 + 0 * 32)); + storeu_output(&tr2_1, (output1 + 1 * 32)); + storeu_output(&tr2_2, (output1 + 2 * 32)); + storeu_output(&tr2_3, (output1 + 3 * 32)); + storeu_output(&tr2_4, (output1 + 4 * 32)); + storeu_output(&tr2_5, (output1 + 5 * 32)); + storeu_output(&tr2_6, (output1 + 6 * 32)); + storeu_output(&tr2_7, (output1 + 7 * 32)); + // Process next 8x8 + output1 += 8; + } + } + } + } + } +} // NOLINT + +#undef ADD_EPI16 +#undef SUB_EPI16 +#undef HIGH_FDCT32x32_2D_C +#undef HIGH_FDCT32x32_2D_ROWS_C diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c new file mode 100644 index 000000000..670f864d0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_config.h" + +#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT +#undef FDCT32x32_2D_AVX2 +#undef FDCT32x32_HIGH_PRECISION diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h new file mode 100644 index 000000000..d3aceae00 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H +#define AOM_DSP_X86_FWD_TXFM_AVX2_H + +#include "./aom_config.h" + +static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) { +#if CONFIG_HIGHBITDEPTH + const __m256i zero = _mm256_setzero_si256(); + const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); + + __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); + __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); + + __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); + __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); + + _mm256_storeu_si256((__m256i *)out, y0); + _mm256_storeu_si256((__m256i *)(out + 8), y1); +#else + _mm256_storeu_si256((__m256i *)out, *coeff); +#endif +} + +#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h new file mode 100644 index 000000000..7bb1db70a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -0,0 +1,1014 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_ports/mem.h" + +// TODO(jingning) The high bit-depth functions need rework for performance. +// After we properly fix the high bit-depth function implementations, this +// file's dependency should be substantially simplified. +#if DCT_HIGH_BIT_DEPTH +#define ADD_EPI16 _mm_adds_epi16 +#define SUB_EPI16 _mm_subs_epi16 + +#else +#define ADD_EPI16 _mm_add_epi16 +#define SUB_EPI16 _mm_sub_epi16 +#endif + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + __m128i in0, in1; +#if DCT_HIGH_BIT_DEPTH + __m128i cmp0, cmp1; + int test, overflow; +#endif + + // Load inputs. + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); +#if DCT_HIGH_BIT_DEPTH + // Check inputs small enough to use optimised code + cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), + _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00))); + cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)), + _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00))); + test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1)); + if (test) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // multiply by 16 to give some extra precision + in0 = _mm_slli_epi16(in0, 4); + in1 = _mm_slli_epi16(in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); + in0 = _mm_add_epi16(in0, mask); + in0 = _mm_add_epi16(in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(in0, in1); + const __m128i r1 = _mm_unpackhi_epi16(in0, in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). + // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&x0, &x1); + if (overflow) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + in0 = _mm_shuffle_epi32(x0, 0xD8); + in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. + // Stage 3: Add/subtract + + // t0 = [c0 c1 c8 c9 c4 c5 cC cD] + // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] + const __m128i t0 = ADD_EPI16(in0, in1); + const __m128i t1 = SUB_EPI16(in0, in1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&t0, &t1); + if (overflow) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + // w0 = [o0 o4 o8 oC] + // w1 = [o2 o6 oA oE] + // w2 = [o1 o5 o9 oD] + // w3 = [o3 o7 oB oF] + // remember the o's are numbered according to the correct output location + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&x0, &x1); + if (overflow) { + aom_highbd_fdct4x4_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // x0 = [o0 o4 o8 oC o2 o6 oA oE] + // x1 = [o1 o5 o9 oD o3 o7 oB oF] + const __m128i y0 = _mm_unpacklo_epi16(x0, x1); + const __m128i y1 = _mm_unpackhi_epi16(x0, x1); + // y0 = [o0 o1 o4 o5 o8 o9 oC oD] + // y1 = [o2 o3 o6 o7 oA oB oE oF] + in0 = _mm_unpacklo_epi32(y0, y1); + // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] + in1 = _mm_unpackhi_epi32(y0, y1); + // in1 = [o8 o9 oA oB oC oD oE oF] + } + } + } + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. + storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { + int pass; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + // Load input + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + // Pre-condition input (shift by two) + in0 = _mm_slli_epi16(in0, 2); + in1 = _mm_slli_epi16(in1, 2); + in2 = _mm_slli_epi16(in2, 2); + in3 = _mm_slli_epi16(in3, 2); + in4 = _mm_slli_epi16(in4, 2); + in5 = _mm_slli_epi16(in5, 2); + in6 = _mm_slli_epi16(in6, 2); + in7 = _mm_slli_epi16(in7, 2); + + // We do two passes, first the columns, then the rows. The results of the + // first pass are transposed so that the same column code can be reused. The + // results of the second pass are also transposed so that the rows (processed + // as columns) are put back in row positions. + for (pass = 0; pass < 2; pass++) { + // To store results of each pass before the transpose. + __m128i res0, res1, res2, res3, res4, res5, res6, res7; + // Add/subtract + const __m128i q0 = ADD_EPI16(in0, in7); + const __m128i q1 = ADD_EPI16(in1, in6); + const __m128i q2 = ADD_EPI16(in2, in5); + const __m128i q3 = ADD_EPI16(in3, in4); + const __m128i q4 = SUB_EPI16(in3, in4); + const __m128i q5 = SUB_EPI16(in2, in5); + const __m128i q6 = SUB_EPI16(in1, in6); + const __m128i q7 = SUB_EPI16(in0, in7); +#if DCT_HIGH_BIT_DEPTH + if (pass == 1) { + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res0 = _mm_packs_epi32(w0, w1); + res4 = _mm_packs_epi32(w2, w3); + res2 = _mm_packs_epi32(w4, w5); + res6 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us into 32bits + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us into 32bits + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res1 = _mm_packs_epi32(w0, w1); + res7 = _mm_packs_epi32(w2, w3); + res5 = _mm_packs_epi32(w4, w5); + res3 = _mm_packs_epi32(w6, w7); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3); + if (overflow) { + aom_highbd_fdct8x8_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + // Transpose the 8x8. + { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } + // Post-condition output and store it + { + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); + in0 = _mm_sub_epi16(in0, sign_in0); + in1 = _mm_sub_epi16(in1, sign_in1); + in2 = _mm_sub_epi16(in2, sign_in2); + in3 = _mm_sub_epi16(in3, sign_in3); + in4 = _mm_sub_epi16(in4, sign_in4); + in5 = _mm_sub_epi16(in5, sign_in5); + in6 = _mm_sub_epi16(in6, sign_in6); + in7 = _mm_sub_epi16(in7, sign_in7); + in0 = _mm_srai_epi16(in0, 1); + in1 = _mm_srai_epi16(in1, 1); + in2 = _mm_srai_epi16(in2, 1); + in3 = _mm_srai_epi16(in3, 1); + in4 = _mm_srai_epi16(in4, 1); + in5 = _mm_srai_epi16(in5, 1); + in6 = _mm_srai_epi16(in6, 1); + in7 = _mm_srai_epi16(in7, 1); + // store results + store_output(&in0, (output + 0 * 8)); + store_output(&in1, (output + 1 * 8)); + store_output(&in2, (output + 2 * 8)); + store_output(&in3, (output + 3 * 8)); + store_output(&in4, (output + 4 * 8)); + store_output(&in5, (output + 5 * 8)); + store_output(&in6, (output + 6 * 8)); + store_output(&in7, (output + 7 * 8)); + } +} + +void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we transpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + int pass; + // We need an intermediate buffer between passes. + DECLARE_ALIGNED(16, int16_t, intermediate[256]); + const int16_t *in = input; + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; +#if DCT_HIGH_BIT_DEPTH + int overflow; +#endif + for (column_start = 0; column_start < 16; column_start += 8) { + __m128i in00, in01, in02, in03, in04, in05, in06, in07; + __m128i in08, in09, in10, in11, in12, in13, in14, in15; + __m128i input0, input1, input2, input3, input4, input5, input6, input7; + __m128i step1_0, step1_1, step1_2, step1_3; + __m128i step1_4, step1_5, step1_6, step1_7; + __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + __m128i step3_0, step3_1, step3_2, step3_3; + __m128i step3_4, step3_5, step3_6, step3_7; + __m128i res00, res01, res02, res03, res04, res05, res06, res07; + __m128i res08, res09, res10, res11, res12, res13, res14, res15; + // Load and pre-condition input. + if (0 == pass) { + in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); + // x = x << 2 + in00 = _mm_slli_epi16(in00, 2); + in01 = _mm_slli_epi16(in01, 2); + in02 = _mm_slli_epi16(in02, 2); + in03 = _mm_slli_epi16(in03, 2); + in04 = _mm_slli_epi16(in04, 2); + in05 = _mm_slli_epi16(in05, 2); + in06 = _mm_slli_epi16(in06, 2); + in07 = _mm_slli_epi16(in07, 2); + in08 = _mm_slli_epi16(in08, 2); + in09 = _mm_slli_epi16(in09, 2); + in10 = _mm_slli_epi16(in10, 2); + in11 = _mm_slli_epi16(in11, 2); + in12 = _mm_slli_epi16(in12, 2); + in13 = _mm_slli_epi16(in13, 2); + in14 = _mm_slli_epi16(in14, 2); + in15 = _mm_slli_epi16(in15, 2); + } else { + in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); + in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); + // x = (x + 1) >> 2 + in00 = _mm_add_epi16(in00, kOne); + in01 = _mm_add_epi16(in01, kOne); + in02 = _mm_add_epi16(in02, kOne); + in03 = _mm_add_epi16(in03, kOne); + in04 = _mm_add_epi16(in04, kOne); + in05 = _mm_add_epi16(in05, kOne); + in06 = _mm_add_epi16(in06, kOne); + in07 = _mm_add_epi16(in07, kOne); + in08 = _mm_add_epi16(in08, kOne); + in09 = _mm_add_epi16(in09, kOne); + in10 = _mm_add_epi16(in10, kOne); + in11 = _mm_add_epi16(in11, kOne); + in12 = _mm_add_epi16(in12, kOne); + in13 = _mm_add_epi16(in13, kOne); + in14 = _mm_add_epi16(in14, kOne); + in15 = _mm_add_epi16(in15, kOne); + in00 = _mm_srai_epi16(in00, 2); + in01 = _mm_srai_epi16(in01, 2); + in02 = _mm_srai_epi16(in02, 2); + in03 = _mm_srai_epi16(in03, 2); + in04 = _mm_srai_epi16(in04, 2); + in05 = _mm_srai_epi16(in05, 2); + in06 = _mm_srai_epi16(in06, 2); + in07 = _mm_srai_epi16(in07, 2); + in08 = _mm_srai_epi16(in08, 2); + in09 = _mm_srai_epi16(in09, 2); + in10 = _mm_srai_epi16(in10, 2); + in11 = _mm_srai_epi16(in11, 2); + in12 = _mm_srai_epi16(in12, 2); + in13 = _mm_srai_epi16(in13, 2); + in14 = _mm_srai_epi16(in14, 2); + in15 = _mm_srai_epi16(in15, 2); + } + in += 8; + // Calculate input for the first 8 results. + { + input0 = ADD_EPI16(in00, in15); + input1 = ADD_EPI16(in01, in14); + input2 = ADD_EPI16(in02, in13); + input3 = ADD_EPI16(in03, in12); + input4 = ADD_EPI16(in04, in11); + input5 = ADD_EPI16(in05, in10); + input6 = ADD_EPI16(in06, in09); + input7 = ADD_EPI16(in07, in08); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3, + &input4, &input5, &input6, &input7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Calculate input for the next 8 results. + { + step1_0 = SUB_EPI16(in07, in08); + step1_1 = SUB_EPI16(in06, in09); + step1_2 = SUB_EPI16(in05, in10); + step1_3 = SUB_EPI16(in04, in11); + step1_4 = SUB_EPI16(in03, in12); + step1_5 = SUB_EPI16(in02, in13); + step1_6 = SUB_EPI16(in01, in14); + step1_7 = SUB_EPI16(in00, in15); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m128i q0 = ADD_EPI16(input0, input7); + const __m128i q1 = ADD_EPI16(input1, input6); + const __m128i q2 = ADD_EPI16(input2, input5); + const __m128i q3 = ADD_EPI16(input3, input4); + const __m128i q4 = SUB_EPI16(input3, input4); + const __m128i q5 = SUB_EPI16(input2, input5); + const __m128i q6 = SUB_EPI16(input1, input6); + const __m128i q7 = SUB_EPI16(input0, input7); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Work on first four results + { + // Add/subtract + const __m128i r0 = ADD_EPI16(q0, q3); + const __m128i r1 = ADD_EPI16(q1, q2); + const __m128i r2 = SUB_EPI16(q1, q2); + const __m128i r3 = SUB_EPI16(q0, q3); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i r0 = + mult_round_shift(&d0, &d1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m128i r1 = + mult_round_shift(&d0, &d1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x2(&r0, &r1); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + { + // Add/subtract + const __m128i x0 = ADD_EPI16(q4, r0); + const __m128i x1 = SUB_EPI16(q4, r0); + const __m128i x2 = SUB_EPI16(q7, r1); + const __m128i x3 = ADD_EPI16(q7, r1); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&res02, &res14, &res10, &res06); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); + const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); + step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 3 + { + step3_0 = ADD_EPI16(step1_0, step2_3); + step3_1 = ADD_EPI16(step1_1, step2_2); + step3_2 = SUB_EPI16(step1_1, step2_2); + step3_3 = SUB_EPI16(step1_0, step2_3); + step3_4 = SUB_EPI16(step1_7, step2_4); + step3_5 = SUB_EPI16(step1_6, step2_5); + step3_6 = ADD_EPI16(step1_6, step2_5); + step3_7 = ADD_EPI16(step1_7, step2_4); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, + &step3_4, &step3_5, &step3_6, &step3_7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 4 + { + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); + step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 5 + { + step1_0 = ADD_EPI16(step3_0, step2_1); + step1_1 = SUB_EPI16(step3_0, step2_1); + step1_2 = ADD_EPI16(step3_3, step2_2); + step1_3 = SUB_EPI16(step3_3, step2_2); + step1_4 = SUB_EPI16(step3_4, step2_5); + step1_5 = ADD_EPI16(step3_4, step2_5); + step1_6 = SUB_EPI16(step3_7, step2_6); + step1_7 = ADD_EPI16(step3_7, step2_6); +#if DCT_HIGH_BIT_DEPTH + overflow = + check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, + &step1_4, &step1_5, &step1_6, &step1_7); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + // step 6 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); + res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); + res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); +#if DCT_HIGH_BIT_DEPTH + overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03); + if (overflow) { + aom_highbd_fdct16x16_c(input, output, stride); + return; + } +#endif // DCT_HIGH_BIT_DEPTH + } + } + // Transpose the results, do it as two 8x8 transposes. + transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, + &res06, &res07, pass, out0, out1); + transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, + &res14, &res15, pass, out0 + 8, out1 + 8); + if (pass == 0) { + out0 += 8 * 16; + } else { + out1 += 8 * 16; + } + } + // Setup in/out for next pass. + in = intermediate; + } +} + +#undef ADD_EPI16 +#undef SUB_EPI16 diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c new file mode 100644 index 000000000..a337e618d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/fwd_txfm_sse2.h" + +void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0, in1; + __m128i tmp; + const __m128i zero = _mm_setzero_si128(); + in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + in1 = _mm_unpacklo_epi64( + in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + in0 = _mm_unpacklo_epi64( + in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + + tmp = _mm_add_epi16(in0, in1); + in0 = _mm_unpacklo_epi16(zero, tmp); + in1 = _mm_unpackhi_epi16(zero, tmp); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(tmp, zero); + in1 = _mm_unpackhi_epi32(tmp, zero); + + tmp = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(tmp, 8); + + in1 = _mm_add_epi32(tmp, in0); + in0 = _mm_slli_epi32(in1, 1); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in0); +} + +void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { + __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); + __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); + __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); + __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); + __m128i u0, u1, sum; + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); + in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); + in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + sum = _mm_add_epi16(u0, u1); + + in0 = _mm_add_epi16(in0, in1); + in2 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, in0); + + u0 = _mm_setzero_si128(); + sum = _mm_add_epi16(sum, in2); + + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); +} + +void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, + int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 2; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); + + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); + in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); + + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + input += 8 * stride; + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 1); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); +} + +void aom_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, + int stride) { + __m128i in0, in1, in2, in3; + __m128i u0, u1; + __m128i sum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; ++i) { + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + in0 = _mm_load_si128((const __m128i *)(input + 0)); + in1 = _mm_load_si128((const __m128i *)(input + 8)); + in2 = _mm_load_si128((const __m128i *)(input + 16)); + in3 = _mm_load_si128((const __m128i *)(input + 24)); + + input += stride; + sum = _mm_add_epi16(sum, u1); + u0 = _mm_add_epi16(in0, in1); + u1 = _mm_add_epi16(in2, in3); + sum = _mm_add_epi16(sum, u0); + + sum = _mm_add_epi16(sum, u1); + } + + u0 = _mm_setzero_si128(); + in0 = _mm_unpacklo_epi16(u0, sum); + in1 = _mm_unpackhi_epi16(u0, sum); + in0 = _mm_srai_epi32(in0, 16); + in1 = _mm_srai_epi32(in1, 16); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_unpacklo_epi32(sum, u0); + in1 = _mm_unpackhi_epi32(sum, u0); + + sum = _mm_add_epi32(in0, in1); + in0 = _mm_srli_si128(sum, 8); + + in1 = _mm_add_epi32(sum, in0); + in1 = _mm_srai_epi32(in1, 3); + output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); +} + +#define DCT_HIGH_BIT_DEPTH 0 +#define FDCT4x4_2D aom_fdct4x4_sse2 +#define FDCT8x8_2D aom_fdct8x8_sse2 +#define FDCT16x16_2D aom_fdct16x16_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D + +#define FDCT32x32_2D aom_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D aom_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH + +#if CONFIG_HIGHBITDEPTH +#define DCT_HIGH_BIT_DEPTH 1 +#define FDCT4x4_2D aom_highbd_fdct4x4_sse2 +#define FDCT8x8_2D aom_highbd_fdct8x8_sse2 +#define FDCT16x16_2D aom_highbd_fdct16x16_sse2 +#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT +#undef FDCT4x4_2D +#undef FDCT8x8_2D +#undef FDCT16x16_2D + +#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2 +#define FDCT32x32_HIGH_PRECISION 0 +#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION + +#define FDCT32x32_2D aom_highbd_fdct32x32_sse2 +#define FDCT32x32_HIGH_PRECISION 1 +#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT +#undef FDCT32x32_2D +#undef FDCT32x32_HIGH_PRECISION +#undef DCT_HIGH_BIT_DEPTH +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h new file mode 100644 index 000000000..26b2db2e0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_DSP_X86_FWD_TXFM_SSE2_H_ + +#include "aom_dsp/x86/txfm_common_intrin.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { + __m128i buf0, buf1; + buf0 = _mm_mul_epu32(a, b); + a = _mm_srli_epi64(a, 32); + b = _mm_srli_epi64(b, 32); + buf1 = _mm_mul_epu32(a, b); + return _mm_add_epi64(buf0, buf1); +} + +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) { + __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); + __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); + return _mm_unpacklo_epi64(buf0, buf1); +} + +static INLINE int check_epi16_overflow_x2(const __m128i *preg0, + const __m128i *preg1) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + cmp0 = _mm_or_si128(cmp0, cmp1); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3) { + const __m128i max_overflow = _mm_set1_epi16(0x7fff); + const __m128i min_overflow = _mm_set1_epi16(0x8000); + __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), + _mm_cmpeq_epi16(*preg0, min_overflow)); + __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), + _mm_cmpeq_epi16(*preg1, min_overflow)); + __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), + _mm_cmpeq_epi16(*preg2, min_overflow)); + __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), + _mm_cmpeq_epi16(*preg3, min_overflow)); + cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); + return _mm_movemask_epi8(cmp0); +} + +static INLINE int check_epi16_overflow_x8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x12( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + } + return res0 + res1; +} + +static INLINE int check_epi16_overflow_x32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31) { + int res0, res1; + res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); + res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); + if (!res0) { + res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); + if (!res1) { + res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); + if (!res0) { + res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); + if (!res1) { + res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); + if (!res0) { + res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); + if (!res1) + res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); + } + } + } + } + } + return res0 + res1; +} + +static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, + const __m128i *preg1, + const __m128i *preg2, + const __m128i *preg3, + const __m128i *zero) { + __m128i minus_one = _mm_set1_epi32(-1); + // Check for overflows + __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1); + __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); + __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); + __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); + __m128i reg0_top_dwords = + _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg1_top_dwords = + _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg2_top_dwords = + _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i reg3_top_dwords = + _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); + __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); + __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); + __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); + __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); + __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); + __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); + int overflow_01 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); + int overflow_23 = + _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); + return (overflow_01 + overflow_23); +} + +static INLINE int k_check_epi32_overflow_8( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + } + return overflow; +} + +static INLINE int k_check_epi32_overflow_16( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); + } + } + } + return overflow; +} + +static INLINE int k_check_epi32_overflow_32( + const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, + const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, + const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, + const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, + const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, + const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, + const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, + const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, + const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, + const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, + const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { + int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); + if (!overflow) { + overflow = + k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, + preg27, zero); + if (!overflow) { + overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, + preg31, zero); + } + } + } + } + } + } + } + return overflow; +} + +static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { +#if CONFIG_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); +#else + _mm_store_si128((__m128i *)(dst_ptr), *poutput); +#endif // CONFIG_HIGHBITDEPTH +} + +static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, + const __m128i *pmultiplier, + const __m128i *prounding, int shift) { + const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier); + const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier); + const __m128i v0 = _mm_add_epi32(u0, *prounding); + const __m128i v1 = _mm_add_epi32(u1, *prounding); + const __m128i w0 = _mm_srai_epi32(v0, shift); + const __m128i w1 = _mm_srai_epi32(v1, shift); + return _mm_packs_epi32(w0, w1); +} + +static INLINE void transpose_and_output8x8( + const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, + const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, + const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr, + tran_low_t *out1_ptr) { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); + const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); + const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); + const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); + const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); + const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); + const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); + const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 54 54 55 55 56 56 57 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 21 36 + // 44 54 64 74 45 55 61 76 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + if (pass == 0) { + _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); + _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); + _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); + _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); + _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); + _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); + _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); + _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); + } else { + storeu_output(&tr2_0, (out1_ptr + 0 * 16)); + storeu_output(&tr2_1, (out1_ptr + 1 * 16)); + storeu_output(&tr2_2, (out1_ptr + 2 * 16)); + storeu_output(&tr2_3, (out1_ptr + 3 * 16)); + storeu_output(&tr2_4, (out1_ptr + 4 * 16)); + storeu_output(&tr2_5, (out1_ptr + 5 * 16)); + storeu_output(&tr2_6, (out1_ptr + 6 * 16)); + storeu_output(&tr2_7, (out1_ptr + 7 * 16)); + } +} + +void fdct32_8col(__m128i *in0, __m128i *in1); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm new file mode 100644 index 000000000..8fa1c04d0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -0,0 +1,204 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +; This file provides SSSE3 version of the forward transformation. Part +; of the macro definitions are originally derived from the ffmpeg project. +; The current version applies to x86 64-bit only. + +SECTION_RODATA + +pw_11585x2: times 8 dw 23170 +pd_8192: times 4 dd 8192 + +%macro TRANSFORM_COEFFS 2 +pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 +pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1 +%endmacro + +TRANSFORM_COEFFS 11585, 11585 +TRANSFORM_COEFFS 15137, 6270 +TRANSFORM_COEFFS 16069, 3196 +TRANSFORM_COEFFS 9102, 13623 + +SECTION .text + +%if ARCH_X86_64 +%macro SUM_SUB 3 + psubw m%3, m%1, m%2 + paddw m%1, m%2 + SWAP %2, %3 +%endmacro + +; butterfly operation +%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 + pmaddwd m%1, m%3, %5 + pmaddwd m%2, m%3, %6 + paddd m%1, %4 + paddd m%2, %4 + psrad m%1, 14 + psrad m%2, 14 +%endmacro + +%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 + punpckhwd m%6, m%2, m%1 + MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] + punpcklwd m%2, m%1 + MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] + packssdw m%1, m%7 + packssdw m%2, m%6 +%endmacro + +; matrix transpose +%macro INTERLEAVE_2X 4 + punpckh%1 m%4, m%2, m%3 + punpckl%1 m%2, m%3 + SWAP %3, %4 +%endmacro + +%macro TRANSPOSE8X8 9 + INTERLEAVE_2X wd, %1, %2, %9 + INTERLEAVE_2X wd, %3, %4, %9 + INTERLEAVE_2X wd, %5, %6, %9 + INTERLEAVE_2X wd, %7, %8, %9 + + INTERLEAVE_2X dq, %1, %3, %9 + INTERLEAVE_2X dq, %2, %4, %9 + INTERLEAVE_2X dq, %5, %7, %9 + INTERLEAVE_2X dq, %6, %8, %9 + + INTERLEAVE_2X qdq, %1, %5, %9 + INTERLEAVE_2X qdq, %3, %7, %9 + INTERLEAVE_2X qdq, %2, %6, %9 + INTERLEAVE_2X qdq, %4, %8, %9 + + SWAP %2, %5 + SWAP %4, %7 +%endmacro + +; 1D forward 8x8 DCT transform +%macro FDCT8_1D 1 + SUM_SUB 0, 7, 9 + SUM_SUB 1, 6, 9 + SUM_SUB 2, 5, 9 + SUM_SUB 3, 4, 9 + + SUM_SUB 0, 3, 9 + SUM_SUB 1, 2, 9 + SUM_SUB 6, 5, 9 +%if %1 == 0 + SUM_SUB 0, 1, 9 +%endif + + BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 + + pmulhrsw m6, m12 + pmulhrsw m5, m12 +%if %1 == 0 + pmulhrsw m0, m12 + pmulhrsw m1, m12 +%else + BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 + SWAP 0, 1 +%endif + + SUM_SUB 4, 5, 9 + SUM_SUB 7, 6, 9 + BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 + BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 + SWAP 1, 4 + SWAP 3, 6 +%endmacro + +%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 + psraw m%3, m%1, 15 + psraw m%4, m%2, 15 + psubw m%1, m%3 + psubw m%2, m%4 + psraw m%1, 1 + psraw m%2, 1 +%endmacro + +%macro STORE_OUTPUT 2 ; index, result +%if CONFIG_HIGHBITDEPTH + ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + ; _mm_store_si128((__m128i *)(dst_ptr), out0); + ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + pxor m11, m11 + pcmpgtw m11, m%2 + movdqa m12, m%2 + punpcklwd m%2, m11 + punpckhwd m12, m11 + mova [outputq + 4*%1 + 0], m%2 + mova [outputq + 4*%1 + 16], m12 +%else + mova [outputq + 2*%1], m%2 +%endif +%endmacro + +INIT_XMM ssse3 +cglobal fdct8x8, 3, 5, 13, input, output, stride + + mova m8, [pd_8192] + mova m12, [pw_11585x2] + + lea r3, [2 * strideq] + lea r4, [4 * strideq] + mova m0, [inputq] + mova m1, [inputq + r3] + lea inputq, [inputq + r4] + mova m2, [inputq] + mova m3, [inputq + r3] + lea inputq, [inputq + r4] + mova m4, [inputq] + mova m5, [inputq + r3] + lea inputq, [inputq + r4] + mova m6, [inputq] + mova m7, [inputq + r3] + + ; left shift by 2 to increase forward transformation precision + psllw m0, 2 + psllw m1, 2 + psllw m2, 2 + psllw m3, 2 + psllw m4, 2 + psllw m5, 2 + psllw m6, 2 + psllw m7, 2 + + ; column transform + FDCT8_1D 0 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + FDCT8_1D 1 + TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 + + DIVIDE_ROUND_2X 0, 1, 9, 10 + DIVIDE_ROUND_2X 2, 3, 9, 10 + DIVIDE_ROUND_2X 4, 5, 9, 10 + DIVIDE_ROUND_2X 6, 7, 9, 10 + + STORE_OUTPUT 0, 0 + STORE_OUTPUT 8, 1 + STORE_OUTPUT 16, 2 + STORE_OUTPUT 24, 3 + STORE_OUTPUT 32, 4 + STORE_OUTPUT 40, 5 + STORE_OUTPUT 48, 6 + STORE_OUTPUT 56, 7 + + RET +%endif diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm new file mode 100644 index 000000000..60446b086 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm @@ -0,0 +1,349 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, +; int ref_stride, +; unsigned char *src, +; int src_stride, +; unsigned int height, +; int *sum, +; unsigned int *sumsquared) +global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE +sym(aom_half_horiz_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref + + mov rdi, arg(2) ;src + movsxd rcx, dword ptr arg(4) ;height + movsxd rax, dword ptr arg(1) ;ref_stride + movsxd rdx, dword ptr arg(3) ;src_stride + + pxor xmm0, xmm0 ; + + movdqu xmm5, XMMWORD PTR [rsi] + movdqu xmm3, XMMWORD PTR [rsi+1] + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 + + lea rsi, [rsi + rax] + +aom_half_horiz_vert_variance16x_h_1: + movdqu xmm1, XMMWORD PTR [rsi] ; + movdqu xmm2, XMMWORD PTR [rsi+1] ; + pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 + + pavgb xmm5, xmm1 ; xmm = vertical average of the above + + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm4, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + psubw xmm5, xmm3 ; xmm5 -= xmm3 + + movq xmm3, QWORD PTR [rdi+8] + punpcklbw xmm3, xmm0 + psubw xmm4, xmm3 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm1 ; save xmm1 for use on the next row + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz aom_half_horiz_vert_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_half_vert_variance16x_h_sse2(unsigned char *ref, +; int ref_stride, +; unsigned char *src, +; int src_stride, +; unsigned int height, +; int *sum, +; unsigned int *sumsquared) +global sym(aom_half_vert_variance16x_h_sse2) PRIVATE +sym(aom_half_vert_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref + + mov rdi, arg(2) ;src + movsxd rcx, dword ptr arg(4) ;height + movsxd rax, dword ptr arg(1) ;ref_stride + movsxd rdx, dword ptr arg(3) ;src_stride + + movdqu xmm5, XMMWORD PTR [rsi] + lea rsi, [rsi + rax ] + pxor xmm0, xmm0 + +aom_half_vert_variance16x_h_1: + movdqu xmm3, XMMWORD PTR [rsi] + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm4, xmm5 + punpcklbw xmm5, xmm0 + punpckhbw xmm4, xmm0 + + movq xmm2, QWORD PTR [rdi] + punpcklbw xmm2, xmm0 + psubw xmm5, xmm2 + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + psubw xmm4, xmm2 + + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm4 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm4, xmm4 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm4 + + movdqa xmm5, xmm3 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 + jnz aom_half_vert_variance16x_h_1 + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref, +; int ref_stride +; unsigned char *src, +; int src_stride, +; unsigned int height, +; int *sum, +; unsigned int *sumsquared) +global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE +sym(aom_half_horiz_variance16x_h_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + pxor xmm6, xmm6 ; error accumulator + pxor xmm7, xmm7 ; sse eaccumulator + mov rsi, arg(0) ;ref + + mov rdi, arg(2) ;src + movsxd rcx, dword ptr arg(4) ;height + movsxd rax, dword ptr arg(1) ;ref_stride + movsxd rdx, dword ptr arg(3) ;src_stride + + pxor xmm0, xmm0 ; + +aom_half_horiz_variance16x_h_1: + movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 + movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 + + pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) + movdqa xmm1, xmm5 + punpcklbw xmm5, xmm0 ; xmm5 = words of above + punpckhbw xmm1, xmm0 + + movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 + punpcklbw xmm3, xmm0 ; xmm3 = words of above + movq xmm2, QWORD PTR [rdi+8] + punpcklbw xmm2, xmm0 + + psubw xmm5, xmm3 ; xmm5 -= xmm3 + psubw xmm1, xmm2 + paddw xmm6, xmm5 ; xmm6 += accumulated column differences + paddw xmm6, xmm1 + pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 + pmaddwd xmm1, xmm1 + paddd xmm7, xmm5 ; xmm7 += accumulated square column differences + paddd xmm7, xmm1 + + lea rsi, [rsi + rax] + lea rdi, [rdi + rdx] + + sub rcx, 1 ; + jnz aom_half_horiz_variance16x_h_1 ; + + pxor xmm1, xmm1 + pxor xmm5, xmm5 + + punpcklwd xmm0, xmm6 + punpckhwd xmm1, xmm6 + psrad xmm0, 16 + psrad xmm1, 16 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 + + movdqa xmm6, xmm7 + punpckldq xmm6, xmm5 + punpckhdq xmm7, xmm5 + paddd xmm6, xmm7 + + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 + paddd xmm0, xmm1 + + movdqa xmm7, xmm6 + movdqa xmm1, xmm0 + + psrldq xmm7, 8 + psrldq xmm1, 8 + + paddd xmm6, xmm7 + paddd xmm0, xmm1 + + mov rsi, arg(5) ;[Sum] + mov rdi, arg(6) ;[SSE] + + movd [rsi], xmm0 + movd [rdi], xmm6 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; +align 16 +xmm_bi_rd: + times 8 dw 64 +align 16 +aom_bilinear_filters_sse2: + dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 + dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 + dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 + dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 + dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 + dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 + dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c new file mode 100644 index 000000000..a99c0b40e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, + int ref_stride, + const unsigned char *src, + int src_stride, unsigned int height, + int *sum, unsigned int *sumsquared); +void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride, + const unsigned char *src, int src_stride, + unsigned int height, int *sum, + unsigned int *sumsquared); +void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, + const unsigned char *src, int src_stride, + unsigned int height, int *sum, + unsigned int *sumsquared); + +uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src, + int src_stride, + const unsigned char *dst, + int dst_stride, uint32_t *sse) { + int xsum0; + unsigned int xxsum0; + + aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + assert(xsum0 <= 255 * 16 * 16); + assert(xsum0 >= -255 * 16 * 16); + return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); +} + +uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src, + int src_stride, + const unsigned char *dst, + int dst_stride, uint32_t *sse) { + int xsum0; + unsigned int xxsum0; + aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0, + &xxsum0); + + *sse = xxsum0; + assert(xsum0 <= 255 * 16 * 16); + assert(xsum0 >= -255 * 16 * 16); + return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); +} + +uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src, + int src_stride, + const unsigned char *dst, + int dst_stride, uint32_t *sse) { + int xsum0; + unsigned int xxsum0; + + aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, + &xsum0, &xxsum0); + + *sse = xxsum0; + assert(xsum0 <= 255 * 16 * 16); + assert(xsum0 >= -255 * 16 * 16); + return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c new file mode 100644 index 000000000..7d96e26ae --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -0,0 +1,1151 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" + +#define CONV8_ROUNDING_BITS (7) + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + +typedef enum { PACK_8x1, PACK_8x2, PACK_16x1 } PixelPackFormat; + +typedef void (*WritePixels)(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch); + +// ----------------------------------------------------------------------------- +// Copy and average + +void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int width, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + _mm256_storeu_si256((__m256i *)(dst + 32), p2); + _mm256_storeu_si256((__m256i *)(dst + 48), p3); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + do { + const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + _mm256_storeu_si256((__m256i *)dst, p0); + _mm256_storeu_si256((__m256i *)(dst + 16), p1); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + p1 = _mm256_loadu_si256((const __m256i *)src); + src += src_stride; + + _mm256_storeu_si256((__m256i *)dst, p0); + dst += dst_stride; + _mm256_storeu_si256((__m256i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + p1 = _mm_loadu_si128((const __m128i *)src); + src += src_stride; + + _mm_storeu_si128((__m128i *)dst, p0); + dst += dst_stride; + _mm_storeu_si128((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + p1 = _mm_loadl_epi64((const __m128i *)src); + src += src_stride; + + _mm_storel_epi64((__m128i *)dst, p0); + dst += dst_stride; + _mm_storel_epi64((__m128i *)dst, p1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } +} + +void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int width, int h, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; + (void)filter_y; + (void)filter_x_stride; + (void)filter_y_stride; + (void)bd; + + assert(width % 4 == 0); + if (width > 32) { // width = 64 + __m256i p0, p1, p2, p3, u0, u1, u2, u3; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); + p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); + u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); + _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 16) { // width = 32 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); + src += src_stride; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); + dst += dst_stride; + h--; + } while (h > 0); + } else if (width > 8) { // width = 16 + __m256i p0, p1, u0, u1; + do { + p0 = _mm256_loadu_si256((const __m256i *)src); + p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm256_loadu_si256((const __m256i *)dst); + u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); + + _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); + _mm256_storeu_si256((__m256i *)(dst + dst_stride), + _mm256_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else if (width > 4) { // width = 8 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadu_si128((const __m128i *)src); + p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadu_si128((const __m128i *)dst); + u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); + + _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); + _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } else { // width = 4 + __m128i p0, p1, u0, u1; + do { + p0 = _mm_loadl_epi64((const __m128i *)src); + p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); + src += src_stride << 1; + u0 = _mm_loadl_epi64((const __m128i *)dst); + u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); + + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); + _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); + dst += dst_stride << 1; + h -= 2; + } while (h > 0); + } +} + +// ----------------------------------------------------------------------------- +// Horizontal Filtering + +static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); + const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); + const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); + + p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 + p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 + p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 + p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 +} + +// Note: +// Shared by 8x2 and 16x1 block +static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, + __m256i *x /*x[8]*/) { + __m256i pp[8]; + pack_pixels(s0, pp); + pack_pixels(s1, &pp[4]); + x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); + x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); + x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); + x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); + x[4] = x[2]; + x[5] = x[3]; + x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); + x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); +} + +static INLINE void pack_pixels_with_format(const uint16_t *src, + PixelPackFormat fmt, + ptrdiff_t stride, __m256i *x) { + switch (fmt) { + case PACK_8x1: { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); + break; + } + case PACK_8x2: { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); + break; + } + case PACK_16x1: { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); + break; + } + default: { assert(0); } + } +} + +static INLINE void pack_8x1_pixels(const uint16_t *src, const ptrdiff_t pitch, + __m256i *x /*x[4]*/) { + pack_pixels_with_format(src, PACK_8x1, pitch, x); +} + +static INLINE void pack_8x2_pixels(const uint16_t *src, const ptrdiff_t pitch, + __m256i *x /*x[8]*/) { + pack_pixels_with_format(src, PACK_8x2, pitch, x); +} + +static INLINE void pack_16x1_pixels(const uint16_t *src, const ptrdiff_t pitch, + __m256i *x /*x[8]*/) { + pack_pixels_with_format(src, PACK_16x1, pitch, x); +} + +// Note: +// Shared by horizontal and vertical filtering +static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p0 = _mm256_set1_epi32(0x03020100); + const __m256i p1 = _mm256_set1_epi32(0x07060504); + const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); + const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); + f[0] = _mm256_shuffle_epi8(hh, p0); + f[1] = _mm256_shuffle_epi8(hh, p1); + f[2] = _mm256_shuffle_epi8(hh, p2); + f[3] = _mm256_shuffle_epi8(hh, p3); +} + +static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, + const __m256i *fil /*fil[4]*/, + __m256i *y) { + __m256i a, a0, a1; + + a0 = _mm256_madd_epi16(fil[0], sig[0]); + a1 = _mm256_madd_epi16(fil[3], sig[3]); + a = _mm256_add_epi32(a0, a1); + + a0 = _mm256_madd_epi16(fil[1], sig[1]); + a1 = _mm256_madd_epi16(fil[2], sig[2]); + + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); +} + +static void write_8x1_pixels(const __m256i *y, const __m256i *z, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m128i a0 = _mm256_castsi256_si128(*y); + const __m128i a1 = _mm256_extractf128_si256(*y, 1); + __m128i res = _mm_packus_epi32(a0, a1); + (void)z; + (void)pitch; + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void write_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static void write_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t dst_pitch) { + (void)dst_pitch; + __m256i a = _mm256_packus_epi32(*y0, *y1); + a = _mm256_min_epi16(a, *mask); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static void filter_block_width8_horiz( + const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1, + const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d8_h8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_horiz(src, src_pitch, write_8x1_pixels, write_8x2_pixels, + dst, dst_pitch, height, filter, bd); +} + +static void filter_block_width16_horiz(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + const WritePixels write_16x1, + uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, + int bd) { + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void aom_highbd_filter_block1d16_h8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_horiz(src, src_pitch, write_16x1_pixels, dst, dst_pitch, + height, filter, bd); +} + +// 2-tap horizontal filtering + +static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); + const __m256i p = _mm256_set1_epi32(0x09080706); + f[0] = _mm256_shuffle_epi8(hh, p); +} + +// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() +// the difference is s0/s1 specifies first and second rows or, +// first 16 samples and 8-sample shifted 16 samples +static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, + __m256i *sig) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); + __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); + __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); + __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + r1 = _mm256_shuffle_epi8(r1, sf2); + sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); + sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); +} + +static INLINE void pack_8x2_2t_pixels(const uint16_t *src, + const ptrdiff_t pitch, __m256i *sig) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_16x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_2t_pixels(&r0, &r1, sig); +} + +static INLINE void pack_8x1_2t_pixels(const uint16_t *src, + __m256i *sig /*sig[2]*/) { + const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); + const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); + __m256i r0 = _mm256_loadu_si256((const __m256i *)src); + __m256i x0 = _mm256_shuffle_epi8(r0, sf2); + r0 = _mm256_permutevar8x32_epi32(r0, idx); + r0 = _mm256_shuffle_epi8(r0, sf2); + sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); +} + +// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() +static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + __m256i x1 = _mm256_madd_epi16(sig[1], *f); + x0 = _mm256_add_epi32(x0, rounding); + x1 = _mm256_add_epi32(x1, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static INLINE void filter_8x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static INLINE void filter_16x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i x0 = _mm256_madd_epi16(sig[0], *f); + x0 = _mm256_add_epi32(x0, rounding); + *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); +} + +static void filter_block_width8_2t_horiz( + const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1, + const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_8x2_2t_pixels(signal, &ff, &res0, &res1); + write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d8_h2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_horiz(src, src_pitch, write_8x1_pixels, + write_8x2_pixels, dst, dst_pitch, height, filter, + bd); +} + +static void filter_block_width16_2t_horiz(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + const WritePixels write_16x1, + uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16x1_2t_pixels(signal, &ff, &res0, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); +} + +static void aom_highbd_filter_block1d16_h2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_horiz(src, src_pitch, write_16x1_pixels, dst, + dst_pitch, height, filter, bd); +} + +// Vertical Filtering + +static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); + __m256i s1 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); + __m256i s2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); + __m256i s3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); + __m256i s4 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); + __m256i s5 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); + __m256i s6 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); + + s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); + s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); + s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); + s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); + s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); + + sig[0] = _mm256_unpacklo_epi16(s0, s1); + sig[4] = _mm256_unpackhi_epi16(s0, s1); + sig[1] = _mm256_unpacklo_epi16(s2, s3); + sig[5] = _mm256_unpackhi_epi16(s2, s3); + sig[2] = _mm256_unpacklo_epi16(s4, s5); + sig[6] = _mm256_unpackhi_epi16(s4, s5); + sig[8] = s6; +} + +static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + __m256i s0 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); + // base + 8th row + __m256i s1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src + 8 * pitch))); + __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); + __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); + sig[3] = _mm256_unpacklo_epi16(s2, s3); + sig[7] = _mm256_unpackhi_epi16(s2, s3); + sig[8] = s1; +} + +static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_8x1_pixels(sig, f, y0); + filter_8x1_pixels(&sig[4], f, y1); +} + +static INLINE void update_pixels(__m256i *sig) { + int i; + for (i = 0; i < 3; ++i) { + sig[i] = sig[i + 1]; + sig[i + 4] = sig[i + 5]; + } +} + +static INLINE void write_8x1_pixels_ver(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)pitch; + const __m128i v0 = _mm256_castsi256_si128(*y0); + const __m128i v1 = _mm256_castsi256_si128(*y1); + __m128i p = _mm_packus_epi32(v0, v1); + p = _mm_min_epi16(p, _mm256_castsi256_si128(*mask)); + _mm_storeu_si128((__m128i *)dst, p); +} + +static void filter_block_width8_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, WritePixels write_8x1, + WritePixels write_8x2, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 1); + + if (height > 0) { + pack_8x9_pixels(src_ptr, src_pitch, signal); + filter_8x9_pixels(signal, ff, &res0, &res1); + write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d8_v8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_vert(src, src_pitch, write_8x1_pixels_ver, + write_8x2_pixels, dst, dst_pitch, height, filter, + bd); +} + +static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { + __m256i u0, u1, u2, u3; + // load 0-6 rows + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); + const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); + const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); + const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch)); + + u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low + u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high + + u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low + u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high + + sig[0] = _mm256_unpacklo_epi16(u0, u2); + sig[4] = _mm256_unpackhi_epi16(u0, u2); + + sig[8] = _mm256_unpacklo_epi16(u1, u3); + sig[12] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s2, s3, 0x20); + u1 = _mm256_permute2x128_si256(s2, s3, 0x31); + + u2 = _mm256_permute2x128_si256(s3, s4, 0x20); + u3 = _mm256_permute2x128_si256(s3, s4, 0x31); + + sig[1] = _mm256_unpacklo_epi16(u0, u2); + sig[5] = _mm256_unpackhi_epi16(u0, u2); + + sig[9] = _mm256_unpacklo_epi16(u1, u3); + sig[13] = _mm256_unpackhi_epi16(u1, u3); + + u0 = _mm256_permute2x128_si256(s4, s5, 0x20); + u1 = _mm256_permute2x128_si256(s4, s5, 0x31); + + u2 = _mm256_permute2x128_si256(s5, s6, 0x20); + u3 = _mm256_permute2x128_si256(s5, s6, 0x31); + + sig[2] = _mm256_unpacklo_epi16(u0, u2); + sig[6] = _mm256_unpackhi_epi16(u0, u2); + + sig[10] = _mm256_unpacklo_epi16(u1, u3); + sig[14] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s6; +} + +static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // base + 7th row + const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); + // base + 8th row + const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); + + __m256i u0, u1, u2, u3; + u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); + u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); + + u2 = _mm256_permute2x128_si256(s7, s8, 0x20); + u3 = _mm256_permute2x128_si256(s7, s8, 0x31); + + sig[3] = _mm256_unpacklo_epi16(u0, u2); + sig[7] = _mm256_unpackhi_epi16(u0, u2); + + sig[11] = _mm256_unpacklo_epi16(u1, u3); + sig[15] = _mm256_unpackhi_epi16(u1, u3); + + sig[16] = s8; +} + +static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + __m256i res[4]; + int i; + for (i = 0; i < 4; ++i) { + filter_8x1_pixels(&sig[i << 2], f, &res[i]); + } + + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); +} + +static INLINE void write_16x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); + p = _mm256_min_epi16(*y1, *mask); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void write_16x1_pixels_ver(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)y1; + (void)pitch; + const __m256i p = _mm256_min_epi16(*y0, *mask); + _mm256_storeu_si256((__m256i *)dst, p); +} + +static void update_16x9_pixels(__m256i *sig) { + update_pixels(&sig[0]); + update_pixels(&sig[8]); +} + +static void filter_block_width16_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + WritePixels write_16x1, + WritePixels write_16x2, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + write_16x2(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 1); + + if (height > 0) { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + } +} + +static void aom_highbd_filter_block1d16_v8_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_vert(src, src_pitch, write_16x1_pixels_ver, + write_16x2_pixels, dst, dst_pitch, height, filter, + bd); +} + +// 2-tap vertical filtering + +static void pack_16x2_init(const uint16_t *src, __m256i *sig) { + sig[2] = _mm256_loadu_si256((const __m256i *)src); +} + +static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, + __m256i *sig) { + // load the next row + const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); + sig[0] = _mm256_unpacklo_epi16(sig[2], u); + sig[1] = _mm256_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, + __m256i *y0, __m256i *y1) { + filter_16_2t_pixels(sig, f, y0, y1); +} + +static void filter_block_width16_2t_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + WritePixels write_16x1, + uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, + int bd) { + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void aom_highbd_filter_block1d16_v2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_vert(src, src_pitch, write_16x1_pixels, dst, + dst_pitch, height, filter, bd); +} + +static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m128i p = _mm_set1_epi32(0x09080706); + f[0] = _mm_shuffle_epi8(h, p); +} + +static void pack_8x2_init(const uint16_t *src, __m128i *sig) { + sig[2] = _mm_loadu_si128((const __m128i *)src); +} + +static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, + __m128i *sig) { + // load the next row + const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); + sig[0] = _mm_unpacklo_epi16(sig[2], u); + sig[1] = _mm_unpackhi_epi16(sig[2], u); + sig[2] = u; +} + +static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, + __m128i *y0, __m128i *y1) { + const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m128i x0 = _mm_madd_epi16(sig[0], *f); + __m128i x1 = _mm_madd_epi16(sig[1], *f); + x0 = _mm_add_epi32(x0, rounding); + x1 = _mm_add_epi32(x1, rounding); + *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); + *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); +} + +static void write_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + res = _mm_min_epi16(res, *mask); + _mm_storeu_si128((__m128i *)dst, res); +} + +typedef void (*Write8Pixels)(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst); + +static void filter_block_width8_2t_vert(const uint16_t *src_ptr, + ptrdiff_t src_pitch, + Write8Pixels write_8x1, + uint16_t *dst_ptr, ptrdiff_t dst_pitch, + uint32_t height, const int16_t *filter, + int bd) { + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; + + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); + + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + write_8x1(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} + +static void aom_highbd_filter_block1d8_v2_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_vert(src, src_pitch, write_8x1_2t_pixels_ver, dst, + dst_pitch, height, filter, bd); +} + +// Calculation with averaging the input pixels + +static void write_8x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)y1; + (void)pitch; + const __m128i a0 = _mm256_castsi256_si128(*y0); + const __m128i a1 = _mm256_extractf128_si256(*y0, 1); + __m128i res = _mm_packus_epi32(a0, a1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void write_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); + const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); + const __m256i pix = + _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); +} + +static void write_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)pitch; + __m256i a = _mm256_packus_epi32(*y0, *y1); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + a = _mm256_min_epi16(a, *mask); + a = _mm256_avg_epu16(a, pix); + _mm256_storeu_si256((__m256i *)dst, a); +} + +static INLINE void write_8x1_avg_pixels_ver(const __m256i *y0, + const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)pitch; + const __m128i v0 = _mm256_castsi256_si128(*y0); + const __m128i v1 = _mm256_castsi256_si128(*y1); + __m128i p = _mm_packus_epi32(v0, v1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + p = _mm_min_epi16(p, _mm256_castsi256_si128(*mask)); + p = _mm_avg_epu16(p, pix); + _mm_storeu_si128((__m128i *)dst, p); +} + +static INLINE void write_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); + const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); + __m256i p = _mm256_min_epi16(*y0, *mask); + p = _mm256_avg_epu16(p, pix0); + _mm256_storeu_si256((__m256i *)dst, p); + + p = _mm256_min_epi16(*y1, *mask); + p = _mm256_avg_epu16(p, pix1); + _mm256_storeu_si256((__m256i *)(dst + pitch), p); +} + +static INLINE void write_16x1_avg_pixels_ver(const __m256i *y0, + const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { + (void)y1; + (void)pitch; + __m256i p = _mm256_min_epi16(*y0, *mask); + const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); + p = _mm256_avg_epu16(p, pix); + _mm256_storeu_si256((__m256i *)dst, p); +} + +static void write_8x1_2t_avg_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { + __m128i res = _mm_packus_epi32(*y0, *y1); + const __m128i pix = _mm_loadu_si128((const __m128i *)dst); + res = _mm_min_epi16(res, *mask); + res = _mm_avg_epu16(res, pix); + _mm_storeu_si128((__m128i *)dst, res); +} + +static void aom_highbd_filter_block1d8_h8_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_horiz(src, src_pitch, write_8x1_avg_pixels, + write_8x2_avg_pixels, dst, dst_pitch, height, + filter, bd); +} + +static void aom_highbd_filter_block1d16_h8_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_horiz(src, src_pitch, write_16x1_avg_pixels, dst, + dst_pitch, height, filter, bd); +} + +static void aom_highbd_filter_block1d8_v8_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_vert(src, src_pitch, write_8x1_avg_pixels_ver, + write_8x2_avg_pixels, dst, dst_pitch, height, filter, + bd); +} + +static void aom_highbd_filter_block1d16_v8_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_vert(src, src_pitch, write_16x1_avg_pixels_ver, + write_16x2_avg_pixels, dst, dst_pitch, height, + filter, bd); +} + +// 2-tap averaging + +static void aom_highbd_filter_block1d8_h2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_horiz(src, src_pitch, write_8x1_avg_pixels, + write_8x2_avg_pixels, dst, dst_pitch, height, + filter, bd); +} + +static void aom_highbd_filter_block1d16_h2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_horiz(src, src_pitch, write_16x1_avg_pixels, dst, + dst_pitch, height, filter, bd); +} + +static void aom_highbd_filter_block1d16_v2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width16_2t_vert(src, src_pitch, write_16x1_avg_pixels, dst, + dst_pitch, height, filter, bd); +} + +static void aom_highbd_filter_block1d8_v2_avg_avx2( + const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + filter_block_width8_2t_vert(src, src_pitch, write_8x1_2t_avg_pixels_ver, dst, + dst_pitch, height, filter, bd); +} + +typedef void HbdFilter1dFunc(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, + uint32_t, const int16_t *, int); + +#define HIGHBD_FUNC(width, dir, avg, opt) \ + aom_highbd_filter_block1d##width##_##dir##_##avg##opt + +HbdFilter1dFunc HIGHBD_FUNC(4, h8, , sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, h2, , sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v8, , sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v2, , sse2); + +#define aom_highbd_filter_block1d4_h8_avx2 HIGHBD_FUNC(4, h8, , sse2) +#define aom_highbd_filter_block1d4_h2_avx2 HIGHBD_FUNC(4, h2, , sse2) +#define aom_highbd_filter_block1d4_v8_avx2 HIGHBD_FUNC(4, v8, , sse2) +#define aom_highbd_filter_block1d4_v2_avx2 HIGHBD_FUNC(4, v2, , sse2) + +HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_2D(, avx2); + +HbdFilter1dFunc HIGHBD_FUNC(4, h8, avg_, sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, h2, avg_, sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v8, avg_, sse2); +HbdFilter1dFunc HIGHBD_FUNC(4, v2, avg_, sse2); + +#define aom_highbd_filter_block1d4_h8_avg_avx2 HIGHBD_FUNC(4, h8, avg_, sse2) +#define aom_highbd_filter_block1d4_h2_avg_avx2 HIGHBD_FUNC(4, h2, avg_, sse2) +#define aom_highbd_filter_block1d4_v8_avg_avx2 HIGHBD_FUNC(4, v8, avg_, sse2) +#define aom_highbd_filter_block1d4_v2_avg_avx2 HIGHBD_FUNC(4, v2, avg_, sse2) + +HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, + avx2); +HIGH_FUN_CONV_2D(avg_, avx2); + +#undef HIGHBD_FUNC diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm new file mode 100644 index 000000000..5d84ef8a7 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm @@ -0,0 +1,456 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps + movd m1, [aboveq-2] + movq m0, [aboveq] + pshuflw m1, m1, 0x0 + movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 + movlhps m1, m1 ; tl tl tl tl tl tl tl tl + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m3, m3 + movd m4, bpsd + psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl + psllw m3, m4 + pcmpeqw m2, m2 + pxor m4, m4 ; min possible value + pxor m3, m2 ; max possible value + mova m1, [leftq] + pshuflw m2, m1, 0x0 + pshuflw m5, m1, 0x55 + movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + pshuflw m2, m1, 0xaa + pshuflw m5, m1, 0xff + movlhps m2, m5 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + mova m0, [aboveq] + pshuflw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m3, m3 + pxor m4, m4 + pinsrw m3, oned, 0 + pinsrw m4, bpsd, 0 + pshuflw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m3, m3 + mov lineq, -4 + mova m2, m3 + punpcklqdq m1, m1 + psllw m3, m4 + add leftq, 16 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movd m1, [leftq+lineq*4] + movd m2, [leftq+lineq*4+2] + pshuflw m1, m1, 0x0 + pshuflw m2, m2, 0x0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + mova [dstq ], m1 + mova [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps + movd m2, [aboveq-2] + mova m0, [aboveq] + mova m1, [aboveq+16] + pshuflw m2, m2, 0x0 + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m3, m3 + movd m4, bpsd + punpcklqdq m2, m2 + psllw m3, m4 + pcmpeqw m5, m5 + pxor m4, m4 ; min possible value + pxor m3, m5 ; max possible value + DEFINE_ARGS dst, stride, line, left + mov lineq, -8 + psubw m0, m2 + psubw m1, m2 +.loop: + movd m7, [leftq] + pshuflw m5, m7, 0x0 + pshuflw m2, m7, 0x55 + punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 + punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1 + paddw m5, m1 ; t5-tl+l1 to t8-tl+l1 + pminsw m6, m3 + pminsw m5, m3 + pmaxsw m6, m4 ; Clamp to the bit-depth + pmaxsw m5, m4 + mova [dstq ], m6 + mova [dstq +16], m5 + paddw m6, m2, m0 + paddw m2, m1 + pminsw m6, m3 + pminsw m2, m3 + pmaxsw m6, m4 + pmaxsw m2, m4 + mova [dstq+strideq*2 ], m6 + mova [dstq+strideq*2+16], m2 + lea dstq, [dstq+strideq*4] + inc lineq + lea leftq, [leftq+4] + + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps + movd m0, [aboveq-2] + mova m1, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + pshuflw m0, m0, 0x0 + ; Get the values to compute the maximum value at this bit depth + pcmpeqw m5, m5 + movd m6, bpsd + psllw m5, m6 + pcmpeqw m7, m7 + pxor m6, m6 ; min possible value + pxor m5, m7 ; max possible value + punpcklqdq m0, m0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + psubw m1, m0 + psubw m2, m0 + psubw m3, m0 + psubw m4, m0 +.loop: + movd m7, [leftq] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq +48], m0 + movd m7, [leftq+2] + pshuflw m7, m7, 0x0 + punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2 + paddw m0, m7, m1 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2 ], m0 + paddw m0, m7, m2 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+16], m0 + paddw m0, m7, m3 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+32], m0 + paddw m0, m7, m4 + pminsw m0, m5 + pmaxsw m0, m6 + mova [dstq+strideq*2+48], m0 + lea dstq, [dstq+strideq*4] + lea leftq, [leftq+4] + inc lineq + jnz .loop + REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c new file mode 100644 index 000000000..76369871b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -0,0 +1,1140 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { + __m128i ubounded; + __m128i lbounded; + __m128i retval; + + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + __m128i t80, max, min; + + if (bd == 8) { + t80 = _mm_set1_epi16(0x80); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); + } else if (bd == 10) { + t80 = _mm_set1_epi16(0x200); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); + } else { // bd == 12 + t80 = _mm_set1_epi16(0x800); + max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); + } + + min = _mm_subs_epi16(zero, t80); + + ubounded = _mm_cmpgt_epi16(value, max); + lbounded = _mm_cmplt_epi16(value, min); + retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value); + ubounded = _mm_and_si128(ubounded, max); + lbounded = _mm_and_si128(lbounded, min); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_or_si128(retval, lbounded); + return retval; +} + +// TODO(debargha, peter): Break up large functions into smaller ones +// in this file. +void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + __m128i blimit, limit, thresh; + __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; + __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; + __m128i ps1, qs1, ps0, qs0; + __m128i abs_p0q0, abs_p1q1, ffff, work; + __m128i filt, work_a, filter1, filter2; + __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; + __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; + __m128i flat2_q0, flat2_p0; + __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3; + __m128i t4, t3, t80, t1; + __m128i eight, four; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + } + + q4 = _mm_load_si128((__m128i *)(s + 4 * p)); + p4 = _mm_load_si128((__m128i *)(s - 5 * p)); + q3 = _mm_load_si128((__m128i *)(s + 3 * p)); + p3 = _mm_load_si128((__m128i *)(s - 4 * p)); + q2 = _mm_load_si128((__m128i *)(s + 2 * p)); + p2 = _mm_load_si128((__m128i *)(s - 3 * p)); + q1 = _mm_load_si128((__m128i *)(s + 1 * p)); + p1 = _mm_load_si128((__m128i *)(s - 2 * p)); + q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + + // highbd_filter_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + + // highbd_hev_mask (in C code this is actually called from highbd_filter4) + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); // return ~mask + + // lp filter + // highbd_filter4 + t4 = _mm_set1_epi16(4); + t3 = _mm_set1_epi16(3); + if (bd == 8) + t80 = _mm_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm_set1_epi16(0x200); + else // bd == 12 + t80 = _mm_set1_epi16(0x800); + + t1 = _mm_set1_epi16(0x1); + + ps1 = _mm_subs_epi16(p1, t80); + qs1 = _mm_subs_epi16(q1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + + filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), + hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + filt = _mm_and_si128(filt, mask); + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + filter1 = _mm_srai_epi16(filter1, 0x3); + filter2 = _mm_srai_epi16(filter2, 0x3); + + qs0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); + ps0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(hev, filt); + qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); + + // end highbd_filter4 + // loopfilter done + + // highbd_flat_mask4 + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16(work, flat); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); + flat = _mm_max_epi16(work, flat); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + // end flat_mask4 + + // flat & mask = flat && mask (as used in filter8) + // (because, in both vars, each block of 16 either all 1s or all 0s) + flat = _mm_and_si128(flat, mask); + + p5 = _mm_load_si128((__m128i *)(s - 6 * p)); + q5 = _mm_load_si128((__m128i *)(s + 5 * p)); + p6 = _mm_load_si128((__m128i *)(s - 7 * p)); + q6 = _mm_load_si128((__m128i *)(s + 6 * p)); + p7 = _mm_load_si128((__m128i *)(s - 8 * p)); + q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + + // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 + // but referred to as p0-p4 & q0-q4 in fn) + flat2 = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), + _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), + _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), + _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); + flat2 = _mm_max_epi16(work, flat2); + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), + _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); + flat2 = _mm_max_epi16(work, flat2); + + if (bd == 8) + flat2 = _mm_subs_epu16(flat2, one); + else if (bd == 10) + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + + flat2 = _mm_cmpeq_epi16(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + // end highbd_flat_mask5 + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + eight = _mm_set1_epi16(8); + four = _mm_set1_epi16(4); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + flat2_p0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); + flat2_q0 = + _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); + flat_p0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); + flat_q0 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); + + sum_p7 = _mm_add_epi16(p7, p7); + sum_q7 = _mm_add_epi16(q7, q7); + sum_p3 = _mm_add_epi16(p3, p3); + sum_q3 = _mm_add_epi16(q3, q3); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); + flat2_p1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); + flat2_q1 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); + flat_p1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); + flat_q1 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + sum_p3 = _mm_add_epi16(sum_p3, p3); + sum_q3 = _mm_add_epi16(sum_q3, q3); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); + flat2_p2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); + flat2_q2 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); + flat_p2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); + flat_q2 = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); + flat2_p3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); + flat2_q3 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); + flat2_p4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); + flat2_q4 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); + flat2_p5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); + flat2_q5 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); + + sum_p7 = _mm_add_epi16(sum_p7, p7); + sum_q7 = _mm_add_epi16(sum_q7, q7); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); + flat2_p6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); + flat2_q6 = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // highbd_filter8 + p2 = _mm_andnot_si128(flat, p2); + // p2 remains unchanged if !(flat && mask) + flat_p2 = _mm_and_si128(flat, flat_p2); + // when (flat && mask) + p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values + + ps1 = _mm_andnot_si128(flat, ps1); + // p1 takes the value assigned to in in filter4 if !(flat && mask) + flat_p1 = _mm_and_si128(flat, flat_p1); + // when (flat && mask) + p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values + qs1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values + + ps0 = _mm_andnot_si128(flat, ps0); + // p0 takes the value assigned to in in filter4 if !(flat && mask) + flat_p0 = _mm_and_si128(flat, flat_p0); + // when (flat && mask) + p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values + qs0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values + // end highbd_filter8 + + // highbd_filter16 + p6 = _mm_andnot_si128(flat2, p6); + // p6 remains unchanged if !(flat2 && flat && mask) + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + // get values for when (flat2 && flat && mask) + p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values + q6 = _mm_andnot_si128(flat2, q6); + // q6 remains unchanged if !(flat2 && flat && mask) + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + // get values for when (flat2 && flat && mask) + q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values + _mm_store_si128((__m128i *)(s - 7 * p), p6); + _mm_store_si128((__m128i *)(s + 6 * p), q6); + + p5 = _mm_andnot_si128(flat2, p5); + // p5 remains unchanged if !(flat2 && flat && mask) + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + // get values for when (flat2 && flat && mask) + p5 = _mm_or_si128(p5, flat2_p5); + // full list of p5 values + q5 = _mm_andnot_si128(flat2, q5); + // q5 remains unchanged if !(flat2 && flat && mask) + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + // get values for when (flat2 && flat && mask) + q5 = _mm_or_si128(q5, flat2_q5); + // full list of q5 values + _mm_store_si128((__m128i *)(s - 6 * p), p5); + _mm_store_si128((__m128i *)(s + 5 * p), q5); + + p4 = _mm_andnot_si128(flat2, p4); + // p4 remains unchanged if !(flat2 && flat && mask) + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + // get values for when (flat2 && flat && mask) + p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values + q4 = _mm_andnot_si128(flat2, q4); + // q4 remains unchanged if !(flat2 && flat && mask) + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + // get values for when (flat2 && flat && mask) + q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values + _mm_store_si128((__m128i *)(s - 5 * p), p4); + _mm_store_si128((__m128i *)(s + 4 * p), q4); + + p3 = _mm_andnot_si128(flat2, p3); + // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + // get values for when (flat2 && flat && mask) + p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values + q3 = _mm_andnot_si128(flat2, q3); + // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + // get values for when (flat2 && flat && mask) + q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values + _mm_store_si128((__m128i *)(s - 4 * p), p3); + _mm_store_si128((__m128i *)(s + 3 * p), q3); + + p2 = _mm_andnot_si128(flat2, p2); + // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + // get values for when (flat2 && flat && mask) + p2 = _mm_or_si128(p2, flat2_p2); + // full list of p2 values + q2 = _mm_andnot_si128(flat2, q2); + // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + // get values for when (flat2 && flat && mask) + q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values + _mm_store_si128((__m128i *)(s - 3 * p), p2); + _mm_store_si128((__m128i *)(s + 2 * p), q2); + + p1 = _mm_andnot_si128(flat2, p1); + // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + // get values for when (flat2 && flat && mask) + p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values + q1 = _mm_andnot_si128(flat2, q1); + // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + // get values for when (flat2 && flat && mask) + q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values + _mm_store_si128((__m128i *)(s - 2 * p), p1); + _mm_store_si128((__m128i *)(s + 1 * p), q1); + + p0 = _mm_andnot_si128(flat2, p0); + // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + // get values for when (flat2 && flat && mask) + p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values + q0 = _mm_andnot_si128(flat2, q0); + // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + // get values for when (flat2 && flat && mask) + q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values + _mm_store_si128((__m128i *)(s - 1 * p), p0); + _mm_store_si128((__m128i *)(s - 0 * p), q0); +} + +void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); + aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_shft; + + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + t80 = _mm_set1_epi16(0x80); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + t80 = _mm_set1_epi16(0x200); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + t80 = _mm_set1_epi16(0x800); + } + + ps1 = _mm_subs_epi16(p1, t80); + ps0 = _mm_subs_epi16(p0, t80); + qs0 = _mm_subs_epi16(q0, t80); + qs1 = _mm_subs_epi16(q1, t80); + + // filter_mask and hev_mask + abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_max_epi16(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + mask = _mm_max_epi16(abs_q1q0, mask); + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); + + // flat_mask4 + flat = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), + _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2))); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)), + _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); + flat = _mm_max_epi16(work, flat); + flat = _mm_max_epi16(abs_p1p0, flat); + flat = _mm_max_epi16(abs_q1q0, flat); + + if (bd == 8) + flat = _mm_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op2[0], workp_shft); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_op0[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); + + // lp filter + filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = signed_char_clamp_bd_sse2(filt, bd); + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi16(filt, t4); + filter2 = _mm_adds_epi16(filt, t3); + + // Filter1 >> 3 + filter1 = signed_char_clamp_bd_sse2(filter1, bd); + filter1 = _mm_srai_epi16(filter1, 3); + + // Filter2 >> 3 + filter2 = signed_char_clamp_bd_sse2(filter2, bd); + filter2 = _mm_srai_epi16(filter2, 3); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + filt = _mm_andnot_si128(hev, filt); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd); + work_a = _mm_adds_epi16(work_a, t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd); + work_a = _mm_adds_epi16(work_a, t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd); + work_a = _mm_adds_epi16(work_a, t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_store_si128((__m128i *)(s - 3 * p), p2); + _mm_store_si128((__m128i *)(s - 2 * p), p1); + _mm_store_si128((__m128i *)(s - 1 * p), p0); + _mm_store_si128((__m128i *)(s + 0 * p), q0); + _mm_store_si128((__m128i *)(s + 1 * p), q1); + _mm_store_si128((__m128i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_horizontal_8_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); +} + +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + const __m128i zero = _mm_set1_epi16(0); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); + const __m128i one = _mm_set1_epi16(1); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + __m128i work; + const __m128i t4 = _mm_set1_epi16(4); + const __m128i t3 = _mm_set1_epi16(3); + __m128i t80; + __m128i tff80; + __m128i tffe0; + __m128i t1f; + // equivalent to shifting 0x1f left by bitdepth - 8 + // and setting new bits to 1 + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i t7f; + // equivalent to shifting 0x7f left by bitdepth - 8 + // and setting new bits to 1 + __m128i ps1, ps0, qs0, qs1; + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + t80 = _mm_set1_epi16(0x80); + tff80 = _mm_set1_epi16(0xff80); + tffe0 = _mm_set1_epi16(0xffe0); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); + } + + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + + // filter_mask and hev_mask + flat = _mm_max_epi16(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu16(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_max_epi16(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), + _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); + mask = _mm_max_epi16(work, mask); + work = _mm_max_epi16( + _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), + _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_max_epi16(work, mask); + mask = _mm_subs_epu16(mask, limit); + mask = _mm_cmpeq_epi16(mask, zero); + + // filter4 + filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + filt = _mm_and_si128(filt, hev); + work_a = _mm_subs_epi16(qs0, ps0); + filt = _mm_adds_epi16(filt, work_a); + filt = _mm_adds_epi16(filt, work_a); + filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); + filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 + filter1 = _mm_and_si128(filter1, t1f); // clamp the range + filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi16(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, tffe0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + work_a = _mm_cmpgt_epi16(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, tff80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); + q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), + t80); + p0 = _mm_adds_epi16( + signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); + p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), + t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +void aom_highbd_lpf_horizontal_4_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); + aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); +} + +static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; + do { + uint16_t *in = src[idx8x8]; + uint16_t *out = dst[idx8x8]; + + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 + x0 = _mm_unpacklo_epi16(p0, p1); + // 20 30 21 31 22 32 23 33 + x1 = _mm_unpacklo_epi16(p2, p3); + // 40 50 41 51 42 52 43 53 + x2 = _mm_unpacklo_epi16(p4, p5); + // 60 70 61 71 62 72 63 73 + x3 = _mm_unpacklo_epi16(p6, p7); + // 00 10 20 30 01 11 21 31 + x4 = _mm_unpacklo_epi32(x0, x1); + // 40 50 60 70 41 51 61 71 + x5 = _mm_unpacklo_epi32(x2, x3); + // 00 10 20 30 40 50 60 70 + x6 = _mm_unpacklo_epi64(x4, x5); + // 01 11 21 31 41 51 61 71 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); + // 00 10 20 30 40 50 60 70 + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); + // 01 11 21 31 41 51 61 71 + + // 02 12 22 32 03 13 23 33 + x4 = _mm_unpackhi_epi32(x0, x1); + // 42 52 62 72 43 53 63 73 + x5 = _mm_unpackhi_epi32(x2, x3); + // 02 12 22 32 42 52 62 72 + x6 = _mm_unpacklo_epi64(x4, x5); + // 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); + // 02 12 22 32 42 52 62 72 + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); + // 03 13 23 33 43 53 63 73 + + // 04 14 05 15 06 16 07 17 + x0 = _mm_unpackhi_epi16(p0, p1); + // 24 34 25 35 26 36 27 37 + x1 = _mm_unpackhi_epi16(p2, p3); + // 44 54 45 55 46 56 47 57 + x2 = _mm_unpackhi_epi16(p4, p5); + // 64 74 65 75 66 76 67 77 + x3 = _mm_unpackhi_epi16(p6, p7); + // 04 14 24 34 05 15 25 35 + x4 = _mm_unpacklo_epi32(x0, x1); + // 44 54 64 74 45 55 65 75 + x5 = _mm_unpacklo_epi32(x2, x3); + // 04 14 24 34 44 54 64 74 + x6 = _mm_unpacklo_epi64(x4, x5); + // 05 15 25 35 45 55 65 75 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); + // 04 14 24 34 44 54 64 74 + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); + // 05 15 25 35 45 55 65 75 + + // 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi32(x0, x1); + // 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi32(x2, x3); + // 06 16 26 36 46 56 66 76 + x6 = _mm_unpacklo_epi64(x4, x5); + // 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); + // 06 16 26 36 46 56 66 76 + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); + // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { + uint16_t *src0[1]; + uint16_t *src1[1]; + uint16_t *dest0[1]; + uint16_t *dest1[1]; + src0[0] = in0; + src1[0] = in1; + dest0[0] = out; + dest1[0] = out + 8; + highbd_transpose(src0, in_p, dest0, out_p, 1); + highbd_transpose(src1, in_p, dest1, out_p, 1); +} + +void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, p, dst, 8, 1); + + // Loop filtering + aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, p, 1); +} + +void aom_highbd_lpf_vertical_4_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); + uint16_t *src[1]; + uint16_t *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + highbd_transpose(src, p, dst, 8, 1); + + // Loop filtering + aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + highbd_transpose(src, 8, dst, p, 1); +} + +void aom_highbd_lpf_vertical_8_dual_sse2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); + uint16_t *src[2]; + uint16_t *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + highbd_transpose(src, p, dst, 8, 2); + + // Loop filtering + aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, + bd); + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + highbd_transpose(src, 8, dst, p, 2); +} + +void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[256]); + + // Transpose 16x16 + highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, + thresh, bd); + + // Transpose back + highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c new file mode 100644 index 000000000..3ee24ab16 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + __m128i zbins[2]; + __m128i nzbins[2]; + + zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], + (int)zbin_ptr[0]); + zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + (void)scan; + + memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } + + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; + } + } + } + } + *eob_ptr = eob_i + 1; +} + +void aom_highbd_quantize_b_32x32_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + } + *eob_ptr = eob + 1; +} +#endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm new file mode 100644 index 000000000..0c7cb3998 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -0,0 +1,290 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_4x2x4 5-6 0 + movh m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + movhps m0, [srcq +%4*2] + movhps m4, [ref1q+%5*2] + movhps m5, [ref2q+%5*2] + movhps m6, [ref3q+%5*2] + movhps m7, [ref4q+%5*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + movu m2, [ref1q+%3*2] + movhps m0, [srcq +%4*2] + movhps m2, [ref1q+%5*2] + mova m3, m0 + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m4, m2 + + movu m2, [ref2q+%3*2] + mova m3, m0 + movhps m2, [ref2q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m5, m2 + + movu m2, [ref3q+%3*2] + mova m3, m0 + movhps m2, [ref3q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m6, m2 + + movu m2, [ref4q+%3*2] + mova m3, m0 + movhps m2, [ref4q+%5*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_8x2x4 5-6 0 + ; 1st 8 px + mova m0, [srcq +%2*2] +%if %1 == 1 + movu m4, [ref1q+%3*2] + movu m5, [ref2q+%3*2] + movu m6, [ref3q+%3*2] + movu m7, [ref4q+%3*2] + mova m3, m0 + mova m2, m0 + psubusw m3, m4 + psubusw m2, m5 + psubusw m4, m0 + psubusw m5, m0 + por m4, m3 + por m5, m2 + pmaddwd m4, m1 + pmaddwd m5, m1 + mova m3, m0 + mova m2, m0 + psubusw m3, m6 + psubusw m2, m7 + psubusw m6, m0 + psubusw m7, m0 + por m6, m3 + por m7, m2 + pmaddwd m6, m1 + pmaddwd m7, m1 +%else + mova m3, m0 + movu m2, [ref1q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+%3*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endif + + ; 2nd 8 px + mova m0, [srcq +(%4)*2] + mova m3, m0 + movu m2, [ref1q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m4, m2 + movu m2, [ref2q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m5, m2 + movu m2, [ref3q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 + por m2, m3 + mova m3, m0 + pmaddwd m2, m1 + paddd m6, m2 + movu m2, [ref4q+(%5)*2] + psubusw m3, m2 + psubusw m2, m0 +%if %6 == 1 + lea srcq, [srcq +src_strideq*4] + lea ref1q, [ref1q+ref_strideq*4] + lea ref2q, [ref2q+ref_strideq*4] + lea ref3q, [ref3q+ref_strideq*4] + lea ref4q, [ref4q+ref_strideq*4] +%endif + por m2, m3 + pmaddwd m2, m1 + paddd m7, m2 +%endmacro + +; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_16x2x4 5-6 0 + HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8) + HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6 +%endmacro + +; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_32x2x4 5-6 0 + HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16) + HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6 +%endmacro + +; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro HIGH_PROCESS_64x2x4 5-6 0 + HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32) + HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6 +%endmacro + +; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 +%macro HIGH_SADNXN4D 2 +%if UNIX64 +cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + +; set m1 + push srcq + mov srcd, 0x00010001 + movd m1, srcd + pshufd m1, m1, 0x0 + pop srcq + + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + +; convert byte pointers to short pointers + shl srcq, 1 + shl ref2q, 1 + shl ref3q, 1 + shl ref4q, 1 + shl ref1q, 1 + + HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + ; N.B. HIGH_PROCESS outputs dwords (32 bits) + ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM + movhlps m0, m4 + movhlps m1, m5 + movhlps m2, m6 + movhlps m3, m7 + paddd m4, m0 + paddd m5, m1 + paddd m6, m2 + paddd m7, m3 + punpckldq m4, m5 + punpckldq m6, m7 + movhlps m0, m4 + movhlps m1, m6 + paddd m4, m0 + paddd m6, m1 + punpcklqdq m4, m6 + movifnidn r4, r4mp + movu [r4], m4 + RET +%endmacro + + +INIT_XMM sse2 +HIGH_SADNXN4D 64, 64 +HIGH_SADNXN4D 64, 32 +HIGH_SADNXN4D 32, 64 +HIGH_SADNXN4D 32, 32 +HIGH_SADNXN4D 32, 16 +HIGH_SADNXN4D 16, 32 +HIGH_SADNXN4D 16, 16 +HIGH_SADNXN4D 16, 8 +HIGH_SADNXN4D 8, 16 +HIGH_SADNXN4D 8, 8 +HIGH_SADNXN4D 8, 4 +HIGH_SADNXN4D 4, 8 +HIGH_SADNXN4D 4, 4 diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm new file mode 100644 index 000000000..8427b891c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -0,0 +1,366 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro HIGH_SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +; convert src, ref & second_pred to short ptrs (from byte ptrs) + shl srcq, 1 + shl refq, 1 +%if %4 == 1 + shl second_predq, 1 +%endif +%endmacro + +; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD64XN 1-2 0 + HIGH_SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + ; first half of each row + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + paddd m0, m1 + paddd m0, m3 + ; second half of each row + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq+64] + psubusw m5, m1 + psubusw m1, [srcq+64] + por m1, m5 + mova m5, [srcq+80] + psubusw m5, m2 + psubusw m2, [srcq+80] + por m2, m5 + mova m5, [srcq+96] + psubusw m5, m3 + psubusw m3, [srcq+96] + por m3, m5 + mova m5, [srcq+112] + psubusw m5, m4 + psubusw m4, [srcq+112] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 +HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 +HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 +HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 + + +; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD32XN 1-2 0 + HIGH_SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+32] + psubusw m5, m3 + psubusw m3, [srcq+32] + por m3, m5 + mova m5, [srcq+48] + psubusw m5, m4 + psubusw m4, [srcq+48] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD32XN 64 ; highbd_sad32x64_sse2 +HIGH_SAD32XN 32 ; highbd_sad32x32_sse2 +HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 +HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 +HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 +HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 + +; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD16XN 1-2 0 + HIGH_SAD_FN 16, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_strideq*2+16] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+16] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*2+16] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+16] + psubusw m5, m2 + psubusw m2, [srcq+16] + por m2, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*2] + por m3, m5 + mova m5, [srcq+src_strideq*2+16] + psubusw m5, m4 + psubusw m4, [srcq+src_strideq*2+16] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD16XN 32 ; highbd_sad16x32_sse2 +HIGH_SAD16XN 16 ; highbd_sad16x16_sse2 +HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 +HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 +HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 +HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 + + +; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD8XN 1-2 0 + HIGH_SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq*2] + movu m3, [refq+ref_strideq*4] + movu m4, [refq+ref_stride3q*2] +%if %2 == 1 + pavgw m1, [second_predq+mmsize*0] + pavgw m2, [second_predq+mmsize*1] + pavgw m3, [second_predq+mmsize*2] + pavgw m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + mova m5, [srcq] + psubusw m5, m1 + psubusw m1, [srcq] + por m1, m5 + mova m5, [srcq+src_strideq*2] + psubusw m5, m2 + psubusw m2, [srcq+src_strideq*2] + por m2, m5 + mova m5, [srcq+src_strideq*4] + psubusw m5, m3 + psubusw m3, [srcq+src_strideq*4] + por m3, m5 + mova m5, [srcq+src_stride3q*2] + psubusw m5, m4 + psubusw m4, [srcq+src_stride3q*2] + por m4, m5 + paddw m1, m2 + paddw m3, m4 + movhlps m2, m1 + movhlps m4, m3 + paddw m1, m2 + paddw m3, m4 + punpcklwd m1, m6 + punpcklwd m3, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 +HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 +HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 +HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 +HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 +HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm new file mode 100644 index 000000000..797e9c1d4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -0,0 +1,1040 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 + paddd %6, %3 + paddw %4, %2 + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp + add srcq, src_stridemp +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%ifdef PIC ; 64bit PIC + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define block_height dword heightm + %define sec_str sec_stridemp + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, \ + sse, g_bilin_filter, g_pw_8 + %define block_height heightd + + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define block_height heightd + %define sec_str sec_strideq + %else + %define block_height dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, sse + %define block_height heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar block_height, 1 +%endif +%if %2 == 1 ; avg + shl sec_str, 1 +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [dstq] + mova m3, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [dstq] + mova m3, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [dstq] + mova m3, [dstq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [dstq] + mova m3, [dstq + dst_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [dstq] + mova m5, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [dstq] + mova m5, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [dstq] + mova m5, [dstq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m1, [secq] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m2, [secq] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [dstq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 2] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + INC_SRC_BY_SRC_STRIDE + movu m3, [srcq] + movu m5, [srcq+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [dstq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + add secq, sec_str + pavgw m4, [secq] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 4] +%if %2 == 1 ; avg + add secq, sec_str +%endif +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c new file mode 100644 index 000000000..7bc8a0df3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, + ptrdiff_t pred_stride); + +static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); +} + +static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storel_epi64((__m128i *)store_diff, x0); + store_diff = (int64_t *)(diff + 1 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x1); + store_diff = (int64_t *)(diff + 2 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x2); + store_diff = (int64_t *)(diff + 3 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x3); + store_diff = (int64_t *)(diff + 4 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x4); + store_diff = (int64_t *)(diff + 5 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x5); + store_diff = (int64_t *)(diff + 6 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x6); + store_diff = (int64_t *)(diff + 7 * diff_stride); + _mm_storel_epi64((__m128i *)store_diff, x7); +} + +static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3; + __m128i v0, v1, v2, v3; + __m128i x0, x1, x2, x3; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); +} + +static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + + u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + + v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + + x0 = _mm_sub_epi16(u0, v0); + x1 = _mm_sub_epi16(u1, v1); + x2 = _mm_sub_epi16(u2, v2); + x3 = _mm_sub_epi16(u3, v3); + x4 = _mm_sub_epi16(u4, v4); + x5 = _mm_sub_epi16(u5, v5); + x6 = _mm_sub_epi16(u6, v6); + x7 = _mm_sub_epi16(u7, v7); + + _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0); + _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1); + _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2); + _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3); + _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4); + _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5); + _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6); + _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); +} + +static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 3; + src += src_stride << 3; + pred += pred_stride << 3; + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += 8; + src += 8; + pred += 8; + subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 3; + src += src_stride << 3; + pred += pred_stride << 3; + subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 4; + src += src_stride << 4; + pred += pred_stride << 4; + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += 16; + src += 16; + pred += 16; + subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 4; + src += src_stride << 4; + pred += pred_stride << 4; + subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 5; + src += src_stride << 5; + pred += pred_stride << 5; + subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += 32; + src += 32; + pred += 32; + subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 5; + src += src_stride << 5; + pred += pred_stride << 5; + subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 6; + src += src_stride << 6; + pred += pred_stride << 6; + subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += 64; + src += 64; + pred += 64; + subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride, + const uint16_t *src, ptrdiff_t src_stride, + const uint16_t *pred, ptrdiff_t pred_stride) { + subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); + diff += diff_stride << 6; + src += src_stride << 6; + pred += pred_stride << 6; + subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); +} + +static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { + SubtractWxHFuncType ret_func_ptr = NULL; + if (rows == 4) { + if (cols == 4) { + ret_func_ptr = subtract_4x4; + } else if (cols == 8) { + ret_func_ptr = subtract_8x4; + } + } else if (rows == 8) { + if (cols == 4) { + ret_func_ptr = subtract_4x8; + } else if (cols == 8) { + ret_func_ptr = subtract_8x8; + } else if (cols == 16) { + ret_func_ptr = subtract_16x8; + } + } else if (rows == 16) { + if (cols == 8) { + ret_func_ptr = subtract_8x16; + } else if (cols == 16) { + ret_func_ptr = subtract_16x16; + } else if (cols == 32) { + ret_func_ptr = subtract_32x16; + } + } else if (rows == 32) { + if (cols == 16) { + ret_func_ptr = subtract_16x32; + } else if (cols == 32) { + ret_func_ptr = subtract_32x32; + } else if (cols == 64) { + ret_func_ptr = subtract_64x32; + } + } else if (rows == 64) { + if (cols == 32) { + ret_func_ptr = subtract_32x64; + } else if (cols == 64) { + ret_func_ptr = subtract_64x64; + } else if (cols == 128) { + ret_func_ptr = subtract_128x64; + } + } else if (rows == 128) { + if (cols == 64) { + ret_func_ptr = subtract_64x128; + } else if (cols == 128) { + ret_func_ptr = subtract_128x128; + } + } + if (!ret_func_ptr) { + assert(0); + } + return ret_func_ptr; +} + +void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src8, + ptrdiff_t src_stride, const uint8_t *pred8, + ptrdiff_t pred_stride, int bd) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + SubtractWxHFuncType func; + (void)bd; + + func = getSubtractFunc(rows, cols); + func(diff, diff_stride, src, src_stride, pred, pred_stride); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm new file mode 100644 index 000000000..cf8ea498c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -0,0 +1,316 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +;unsigned int aom_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(aom_highbd_calc16x16var_sse2) PRIVATE +sym(aom_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int aom_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(aom_highbd_calc8x8var_sse2) PRIVATE +sym(aom_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c new file mode 100644 index 000000000..29f96ce24 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" + +typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); +} + +#define HIGH_GET_VAR(S) \ + void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + } \ + \ + void aom_highbd_10_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + } \ + \ + void aom_highbd_12_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ + sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + } + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_8_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } \ + \ + uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); + +#undef VAR_FN + +unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, + aom_highbd_calc8x8var_sse2, 8); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in +// highbd_subpel_variance_impl_sse2.asm +#define DECL(w, opt) \ + int aom_highbd_sub_pixel_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + unsigned int *sse, void *unused0, void *unused); +#define DECLS(opt) \ + DECL(8, opt); \ + DECL(16, opt) + +DECLS(sse2); + +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + int start_row; \ + uint32_t sse; \ + int se = 0; \ + int64_t var; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); + +FNS(sse2); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \ + const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused); +#define DECLS(opt1) \ + DECL(16, opt1) \ + DECL(8, opt1) + +DECLS(sse2); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int64_t var; \ + uint32_t sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + uint32_t sse2; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + int64_t var; \ + uint32_t sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row += 16) { \ + uint32_t sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, y_offset, \ + dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define FNS(opt1) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ + FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt1, (int64_t)); + +FNS(sse2); + +#undef FNS +#undef FN + +void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, + const uint8_t *ref8, int ref_stride) { + int i, j; + int stride = ref_stride << 3; + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); + __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); + __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); + __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); + __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32)); + __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40)); + __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48)); + __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56)); + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi16(s0, s1); + t1 = _mm_unpacklo_epi16(s2, s3); + t2 = _mm_unpacklo_epi16(s4, s5); + t3 = _mm_unpacklo_epi16(s6, s7); + t0 = _mm_unpacklo_epi32(t0, t1); + t2 = _mm_unpacklo_epi32(t2, t3); + t0 = _mm_unpacklo_epi64(t0, t2); + + _mm_storeu_si128((__m128i *)(comp_pred), t0); + comp_pred += 8; + ref += 64; // 8 * 8; + } + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); + __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); + __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); + __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi16(s0, s1); + t1 = _mm_unpacklo_epi16(s2, s3); + t0 = _mm_unpacklo_epi32(t0, t1); + + _mm_storel_epi64((__m128i *)(comp_pred), t0); + comp_pred += 4; + ref += 4 * 8; + } + ref += stride - (width << 3); + } + } +} + +void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride) { + const __m128i one = _mm_set1_epi16(1); + int i, j; + int stride = ref_stride << 3; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); + __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); + __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); + __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); + __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32)); + __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40)); + __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48)); + __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56)); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi16(s0, s1); + t1 = _mm_unpacklo_epi16(s2, s3); + t2 = _mm_unpacklo_epi16(s4, s5); + t3 = _mm_unpacklo_epi16(s6, s7); + t0 = _mm_unpacklo_epi32(t0, t1); + t2 = _mm_unpacklo_epi32(t2, t3); + t0 = _mm_unpacklo_epi64(t0, t2); + + p0 = _mm_adds_epu16(t0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + + _mm_storeu_si128((__m128i *)(comp_pred), p0); + comp_pred += 8; + pred += 8; + ref += 8 * 8; + } + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); + __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); + __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); + __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); + __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi16(s0, s1); + t1 = _mm_unpacklo_epi16(s2, s3); + t0 = _mm_unpacklo_epi32(t0, t1); + + p0 = _mm_adds_epu16(t0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + + _mm_storel_epi64((__m128i *)(comp_pred), p0); + comp_pred += 4; + pred += 4; + ref += 4 * 8; + } + ref += stride - (width << 3); + } + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c new file mode 100644 index 000000000..cc7f52811 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include /* SSE4.1 */ + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_dsp/variance.h" +#include "aom_dsp/aom_filter.h" + +static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + uint64_t *sse, int64_t *sum) { + __m128i u0, u1, u2, u3; + __m128i s0, s1, s2, s3; + __m128i t0, t1, x0, y0; + __m128i a0, a1, a2, a3; + __m128i b0, b1, b2, b3; + __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); + a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); + a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); + a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); + + b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); + b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); + b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); + b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); + + u0 = _mm_unpacklo_epi16(a0, a1); + u1 = _mm_unpacklo_epi16(a2, a3); + u2 = _mm_unpacklo_epi16(b0, b1); + u3 = _mm_unpacklo_epi16(b2, b3); + + s0 = _mm_sub_epi16(u0, u2); + s1 = _mm_sub_epi16(u1, u3); + + t0 = _mm_madd_epi16(s0, k_one_epi16); + t1 = _mm_madd_epi16(s1, k_one_epi16); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + y0 = _mm_hadd_epi32(s3, s3); + + t0 = _mm_madd_epi16(s0, s0); + t1 = _mm_madd_epi16(s1, s1); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + x0 = _mm_hadd_epi32(s3, s3); + + *sse = (uint64_t)_mm_extract_epi32(x0, 0); + *sum = (int64_t)_mm_extract_epi32(y0, 0); +} + +uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)local_sse; + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); + sum = ROUND_POWER_OF_TWO(sum, 2); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return (diff >= 0) ? (uint32_t)diff : 0; +} + +uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse) { + int64_t sum, diff; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); + sum = ROUND_POWER_OF_TWO(sum, 4); + + diff = (int64_t)*sse - ((sum * sum) >> 4); + return diff >= 0 ? (uint32_t)diff : 0; +} + +// Sub-pixel +uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, + dst_stride, sse); +} + +// Sub-pixel average + +uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); + + return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, + sse); +} + +uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); + + return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} + +uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, uint32_t *sse, + const uint8_t *second_pred) { + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + aom_highbd_var_filter_block2d_bil_first_pass( + src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); + aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), + 4); + + return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, + dst_stride, sse); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm new file mode 100644 index 000000000..02567db49 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm @@ -0,0 +1,771 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -2 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] + movq m0, [leftq ] + punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 +.loop: + pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 + pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 + movq [dstq ], m1 + movq [dstq+strideq], m2 + pshuflw m1, m0, 0xaa + pshuflw m2, m0, 0xff + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 + inc lineq + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -4 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+strideq ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -8 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16 ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2 ], m1 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x + punpcklbw m0, m1 + pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] + psrldq m0, 2 + psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] + movd m2, [leftq] + punpcklbw m2, m1 + pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] + pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] + paddw m4, m0 + paddw m3, m0 + packuswb m4, m4 + packuswb m3, m3 + movd [dstq ], m4 + movd [dstq+strideq], m3 + lea dstq, [dstq+strideq*2] + pshuflw m4, m2, 0xaa + pshuflw m3, m2, 0xff + paddw m4, m0 + paddw m3, m0 + packuswb m4, m4 + packuswb m3, m3 + movd [dstq ], m4 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + movq m0, [aboveq] + punpcklbw m2, m1 + punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] + pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] + DEFINE_ARGS dst, stride, line, left + mov lineq, -4 + punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] + psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] + movq m2, [leftq] + punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] +.loop: + pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] + pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] + punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] + punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] + paddw m4, m0 + paddw m3, m0 + packuswb m4, m3 + movq [dstq ], m4 + movhps [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m2, 4 + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left + pxor m1, m1 + mova m2, [aboveq-16]; + mova m0, [aboveq] ; t1 t2 ... t16 [byte] + punpckhbw m2, m1 ; [127:112] tl [word] + punpckhbw m4, m0, m1 + punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] + DEFINE_ARGS dst, stride, line, left, stride8 + mov lineq, -8 + pshufhw m2, m2, 0xff + mova m3, [leftq] ; l1 l2 ... l16 [byte] + punpckhqdq m2, m2 ; tl repeated 8 times [word] + psubw m0, m2 + psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] + punpckhbw m5, m3, m1 + punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word] + lea stride8q, [strideq*8] +.loop: + pshuflw m6, m3, 0x0 + pshuflw m7, m5, 0x0 + punpcklqdq m6, m6 ; l1 repeated 8 times [word] + punpcklqdq m7, m7 ; l8 repeated 8 times [word] + paddw m1, m6, m0 + paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] + psrldq m5, 2 + packuswb m1, m6 + mova [dstq ], m1 + paddw m1, m7, m0 + paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] + psrldq m3, 2 + packuswb m1, m7 + mova [dstq+stride8q], m1 + inc lineq + lea dstq, [dstq+strideq] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left + pxor m1, m1 + movd m2, [aboveq-1] + mova m0, [aboveq] + mova m4, [aboveq+16] + punpcklbw m2, m1 + punpckhbw m3, m0, m1 + punpckhbw m5, m4, m1 + punpcklbw m0, m1 + punpcklbw m4, m1 + pshuflw m2, m2, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -16 + punpcklqdq m2, m2 + add leftq, 32 + psubw m0, m2 + psubw m3, m2 + psubw m4, m2 + psubw m5, m2 +.loop: + movd m2, [leftq+lineq*2] + pxor m1, m1 + punpcklbw m2, m1 + pshuflw m7, m2, 0x55 + pshuflw m2, m2, 0x0 + punpcklqdq m2, m2 + punpcklqdq m7, m7 + paddw m6, m2, m3 + paddw m1, m2, m0 + packuswb m1, m6 + mova [dstq ], m1 + paddw m6, m2, m5 + paddw m1, m2, m4 + packuswb m1, m6 + mova [dstq+16 ], m1 + paddw m6, m7, m3 + paddw m1, m7, m0 + packuswb m1, m6 + mova [dstq+strideq ], m1 + paddw m6, m7, m5 + paddw m1, m7, m4 + packuswb m1, m6 + mova [dstq+strideq+16], m1 + lea dstq, [dstq+strideq*2] + inc lineq + jnz .loop + REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm new file mode 100644 index 000000000..bc1bb2ff3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm @@ -0,0 +1,410 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pb_1: times 16 db 1 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 +sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +SECTION .text + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; l1, l2, l3, l4 + movd m1, [aboveq-1] ; tl, t1, t2, t3 + punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 + pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 + psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 + psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 + ; comments below are for a predictor like this + ; A1 B1 C1 D1 + ; A2 B2 A1 B1 + ; A3 B3 A2 B2 + ; A4 B4 A3 B3 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 + pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 + + punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+stride3q ], m3 + psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq*2], m3 + psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq ], m3 + psrldq m3, 2 ; A1 B1 C1 D1 .. + movd [dstq ], m3 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + movq m0, [leftq] ; [0- 7] l1-8 [byte] + movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] + pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] + pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] + pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] + pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] + psrldq m4, m0, 1 ; t1-7 [word] + psrldq m5, m0, 2 ; t2-7 [word] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 + ; A2 B2 A1 B1 C1 D1 E1 F1 + ; A3 B3 A2 B2 A1 B1 C1 D1 + ; A4 B4 A3 B3 A2 B2 A1 B1 + ; A5 B5 A4 B4 A3 B3 A2 B2 + ; A6 B6 A5 B5 A4 B4 A3 B3 + ; A7 B7 A6 B6 A5 B5 A4 B4 + ; A8 B8 A7 B7 A6 B6 A5 B5 + pavgb m6, m1, m2 ; 2-tap avg A8-A1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 + + punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 + palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 + movq [dstq+strideq*2], m0 + psrldq m0, 2 ; A-B2, A-B1, C-H1 + movq [dstq+strideq ], m0 + psrldq m0, 2 ; A-H1 + movq [dstq ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 + psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 + movq [dstq+strideq*2], m6 + psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 + movq [dstq+strideq ], m6 + psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 + movq [dstq ], m6 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 + ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 + ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 + ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 + ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 + ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 + ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 + ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 + ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 + ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 + ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 + ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 + ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 + ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 + ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 + ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 + pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m6, 15 + palignr m3, m0, m6, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] + pavgb m5, m0 ; A1 - Ag + + punpcklbw m0, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... A-Bg + + pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] + pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 + + pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + palignr m2, m1, m6, 14 + mova [dstq ], m2 + palignr m2, m1, m6, 12 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 10 + mova [dstq+strideq*2], m2 + palignr m2, m1, m6, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m1, m6, 6 + mova [dstq ], m2 + palignr m2, m1, m6, 4 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 2 + mova [dstq+strideq*2], m2 + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + mova [dstq+stride3q ], m6 + lea dstq, [dstq+strideq*4] + + palignr m2, m6, m4, 14 + mova [dstq ], m2 + palignr m2, m6, m4, 12 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 10 + mova [dstq+strideq*2], m2 + palignr m2, m6, m4, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m6, m4, 6 + mova [dstq ], m2 + palignr m2, m6, m4, 4 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 2 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c new file mode 100644 index 000000000..5795a1845 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c @@ -0,0 +1,3631 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/inv_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +#define RECON_AND_STORE4X4(dest, in_x) \ + { \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)(dest) = _mm_cvtsi128_si32(d0); \ + } + +void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16( + (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i input0, input1, input2, input3; + + // Rows + input0 = load_input_data(input); + input2 = load_input_data(input + 8); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input0 = _mm_shufflehi_epi16(input0, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input2, 0xd8); + + input1 = _mm_unpackhi_epi32(input0, input0); + input0 = _mm_unpacklo_epi32(input0, input0); + input3 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpacklo_epi32(input2, input2); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input1); + input1 = _mm_packs_epi32(input2, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_unpacklo_epi32(input2, input2); + input1 = _mm_unpackhi_epi32(input2, input2); + input2 = _mm_unpackhi_epi32(input3, input3); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, input2); + input1 = _mm_packs_epi32(input1, input3); + + // Transpose + input2 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpackhi_epi16(input0, input1); + input0 = _mm_unpacklo_epi32(input2, input3); + input1 = _mm_unpackhi_epi32(input2, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. + input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *)(dest + stride))); + d2 = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, input2); + d2 = _mm_add_epi16(d2, input3); + d0 = _mm_packus_epi16(d0, d2); + // store input0 + *(int *)dest = _mm_cvtsi128_si32(d0); + // store input1 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store input2 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + // store input3 + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + } +} + +void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 4); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE4X4(dest + 0 * stride, dc_value); + RECON_AND_STORE4X4(dest + 1 * stride, dc_value); + RECON_AND_STORE4X4(dest + 2 * stride, dc_value); + RECON_AND_STORE4X4(dest + 3 * stride, dc_value); +} + +void aom_idct4_sse2(__m128i *in) { + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8]; + + array_transpose_4x4(in); + // stage 1 + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); + + // stage 2 + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); +} + +void aom_iadst4_sse2(__m128i *in) { + const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); + const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); + const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); + const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); + const __m128i kZero = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i u[8], v[8], in7; + + array_transpose_4x4(in); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); + + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); + u[2] = _mm_unpacklo_epi16(in7, kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); + + v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 + v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 + v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 + v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 + v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 + v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + + u[0] = _mm_add_epi32(v[0], v[1]); + u[1] = _mm_add_epi32(v[3], v[4]); + u[2] = v[2]; + u[3] = _mm_add_epi32(u[0], u[1]); + u[4] = _mm_slli_epi32(v[5], 2); + u[5] = _mm_add_epi32(u[3], v[5]); + u[6] = _mm_sub_epi32(u[5], u[4]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); +} + +// Define Macro for multiplying elements by constants and adding them together. +#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ + res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + } + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ + out4, out5, out6, out7) \ + { \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ + stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ + tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ + tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ + tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + } \ + \ + /* Stage4 */ \ + out0 = _mm_adds_epi16(stp1_0, stp2_7); \ + out1 = _mm_adds_epi16(stp1_1, stp1_6); \ + out2 = _mm_adds_epi16(stp1_2, stp1_5); \ + out3 = _mm_adds_epi16(stp1_3, stp2_4); \ + out4 = _mm_subs_epi16(stp1_3, stp2_4); \ + out5 = _mm_subs_epi16(stp1_2, stp1_5); \ + out6 = _mm_subs_epi16(stp1_1, stp1_6); \ + out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + } + +void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. + in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + in4 = load_input_data(input + 8 * 4); + in5 = load_input_data(input + 8 * 5); + in6 = load_input_data(input + 8 * 6); + in7 = load_input_data(input + 8 * 7); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from aom_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, + in6, in7); + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE(dest + 0 * stride, dc_value); + RECON_AND_STORE(dest + 1 * stride, dc_value); + RECON_AND_STORE(dest + 2 * stride, dc_value); + RECON_AND_STORE(dest + 3 * stride, dc_value); + RECON_AND_STORE(dest + 4 * stride, dc_value); + RECON_AND_STORE(dest + 5 * stride, dc_value); + RECON_AND_STORE(dest + 6 * stride, dc_value); + RECON_AND_STORE(dest + 7 * stride, dc_value); +} + +void aom_idct8_sse2(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // 8x8 Transpose is copied from aom_fdct8x8_sse2() + TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, + in1, in2, in3, in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], + in[4], in[5], in[6], in[7]); +} + +void aom_iadst8_sse2(__m128i *in) { + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__const_0 = _mm_set1_epi16(0); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + + // transpose + array_transpose_8x8(in, in); + + // properly aligned for butterfly input + in0 = in[7]; + in1 = in[0]; + in2 = in[5]; + in3 = in[2]; + in4 = in[3]; + in5 = in[4]; + in6 = in[1]; + in7 = in[6]; + + // column transformation + // stage 1 + // interleave and multiply/add into 32-bit integer + s0 = _mm_unpacklo_epi16(in0, in1); + s1 = _mm_unpackhi_epi16(in0, in1); + s2 = _mm_unpacklo_epi16(in2, in3); + s3 = _mm_unpackhi_epi16(in2, in3); + s4 = _mm_unpacklo_epi16(in4, in5); + s5 = _mm_unpackhi_epi16(in4, in5); + s6 = _mm_unpacklo_epi16(in6, in7); + s7 = _mm_unpackhi_epi16(in6, in7); + + u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); + u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); + u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); + u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); + u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); + u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); + u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); + u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); + u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); + u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); + u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); + u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); + u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); + u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); + u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); + u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + + // addition + w0 = _mm_add_epi32(u0, u8); + w1 = _mm_add_epi32(u1, u9); + w2 = _mm_add_epi32(u2, u10); + w3 = _mm_add_epi32(u3, u11); + w4 = _mm_add_epi32(u4, u12); + w5 = _mm_add_epi32(u5, u13); + w6 = _mm_add_epi32(u6, u14); + w7 = _mm_add_epi32(u7, u15); + w8 = _mm_sub_epi32(u0, u8); + w9 = _mm_sub_epi32(u1, u9); + w10 = _mm_sub_epi32(u2, u10); + w11 = _mm_sub_epi32(u3, u11); + w12 = _mm_sub_epi32(u4, u12); + w13 = _mm_sub_epi32(u5, u13); + w14 = _mm_sub_epi32(u6, u14); + w15 = _mm_sub_epi32(u7, u15); + + // shift and rounding + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); + v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); + v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); + v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); + v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); + v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); + v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); + v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); + u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); + u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); + u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); + u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); + u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); + u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); + u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + + // back to 16-bit and pack 8 integers into __m128i + in[0] = _mm_packs_epi32(u0, u1); + in[1] = _mm_packs_epi32(u2, u3); + in[2] = _mm_packs_epi32(u4, u5); + in[3] = _mm_packs_epi32(u6, u7); + in[4] = _mm_packs_epi32(u8, u9); + in[5] = _mm_packs_epi32(u10, u11); + in[6] = _mm_packs_epi32(u12, u13); + in[7] = _mm_packs_epi32(u14, u15); + + // stage 2 + s0 = _mm_add_epi16(in[0], in[2]); + s1 = _mm_add_epi16(in[1], in[3]); + s2 = _mm_sub_epi16(in[0], in[2]); + s3 = _mm_sub_epi16(in[1], in[3]); + u0 = _mm_unpacklo_epi16(in[4], in[5]); + u1 = _mm_unpackhi_epi16(in[4], in[5]); + u2 = _mm_unpacklo_epi16(in[6], in[7]); + u3 = _mm_unpackhi_epi16(in[6], in[7]); + + v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); + v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); + v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); + v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); + v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); + v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); + v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); + v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); + + w0 = _mm_add_epi32(v0, v4); + w1 = _mm_add_epi32(v1, v5); + w2 = _mm_add_epi32(v2, v6); + w3 = _mm_add_epi32(v3, v7); + w4 = _mm_sub_epi32(v0, v4); + w5 = _mm_sub_epi32(v1, v5); + w6 = _mm_sub_epi32(v2, v6); + w7 = _mm_sub_epi32(v3, v7); + + v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); + + u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + // back to 16-bit intergers + s4 = _mm_packs_epi32(u0, u1); + s5 = _mm_packs_epi32(u2, u3); + s6 = _mm_packs_epi32(u4, u5); + s7 = _mm_packs_epi32(u6, u7); + + // stage 3 + u0 = _mm_unpacklo_epi16(s2, s3); + u1 = _mm_unpackhi_epi16(s2, s3); + u2 = _mm_unpacklo_epi16(s6, s7); + u3 = _mm_unpackhi_epi16(s6, s7); + + v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); + v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); + v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); + v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); + v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); + v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); + v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); + v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); + + u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + s2 = _mm_packs_epi32(v0, v1); + s3 = _mm_packs_epi32(v2, v3); + s6 = _mm_packs_epi32(v4, v5); + s7 = _mm_packs_epi32(v6, v7); + + in[0] = s0; + in[1] = _mm_sub_epi16(k__const_0, s4); + in[2] = s6; + in[3] = _mm_sub_epi16(k__const_0, s2); + in[4] = s3; + in[5] = _mm_sub_epi16(k__const_0, s7); + in[6] = s5; + in[7] = _mm_sub_epi16(k__const_0, s1); +} + +void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + // Rows. Load 4-row input data. + in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + // Stage1 + { + const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); + const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + stp1_5 = _mm_packs_epi32(tmp4, tmp6); + } + + // Stage2 + { + const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); + const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); + + tmp0 = _mm_madd_epi16(lo_04, stg2_0); + tmp2 = _mm_madd_epi16(lo_04, stg2_1); + tmp4 = _mm_madd_epi16(lo_26, stg2_2); + tmp6 = _mm_madd_epi16(lo_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, tmp2); + stp2_2 = _mm_packs_epi32(tmp6, tmp4); + + tmp0 = _mm_adds_epi16(stp1_4, stp1_5); + tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage3 + { + const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + + tmp4 = _mm_adds_epi16(stp2_0, stp2_2); + tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); + stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); + + tmp0 = _mm_madd_epi16(lo_56, stg3_0); + tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + tmp0 = _mm_adds_epi16(stp1_3, stp2_4); + tmp1 = _mm_adds_epi16(stp1_2, stp1_5); + tmp2 = _mm_subs_epi16(stp1_3, stp2_4); + tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, + in5, in6, in7); + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +#define IDCT16 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ + const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ + const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ + const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ + const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ + stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ + \ + MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ + stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ + const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ + const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ + stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ + \ + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + \ + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ + const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ + const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + } + +#define IDCT16_10 \ + /* Stage2 */ \ + { \ + const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ + const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ + stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ + stp1_12_0) \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ + \ + stp1_9 = stp1_8_0; \ + stp1_10 = stp1_11; \ + \ + stp1_13 = stp1_12_0; \ + stp1_14 = stp1_15; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ + stp2_5 = stp2_4; \ + stp2_6 = stp2_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + \ + stp1_2 = stp1_1; \ + stp1_3 = stp1_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ + \ + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + } + +void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[16], l[16], r[16], *curr1; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + curr1 = l; + for (i = 0; i < 2; i++) { + // 1-D idct + + // Load input data. + in[0] = load_input_data(input); + in[8] = load_input_data(input + 8 * 1); + in[1] = load_input_data(input + 8 * 2); + in[9] = load_input_data(input + 8 * 3); + in[2] = load_input_data(input + 8 * 4); + in[10] = load_input_data(input + 8 * 5); + in[3] = load_input_data(input + 8 * 6); + in[11] = load_input_data(input + 8 * 7); + in[4] = load_input_data(input + 8 * 8); + in[12] = load_input_data(input + 8 * 9); + in[5] = load_input_data(input + 8 * 10); + in[13] = load_input_data(input + 8 * 11); + in[6] = load_input_data(input + 8 * 12); + in[14] = load_input_data(input + 8 * 13); + in[7] = load_input_data(input + 8 * 14); + in[15] = load_input_data(input + 8 * 15); + + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + + IDCT16 + + // Stage7 + curr1[0] = _mm_add_epi16(stp2_0, stp1_15); + curr1[1] = _mm_add_epi16(stp2_1, stp1_14); + curr1[2] = _mm_add_epi16(stp2_2, stp2_13); + curr1[3] = _mm_add_epi16(stp2_3, stp2_12); + curr1[4] = _mm_add_epi16(stp2_4, stp2_11); + curr1[5] = _mm_add_epi16(stp2_5, stp2_10); + curr1[6] = _mm_add_epi16(stp2_6, stp1_9); + curr1[7] = _mm_add_epi16(stp2_7, stp1_8); + curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); + curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); + curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); + curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); + curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); + curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); + curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); + curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); + + curr1 = r; + input += 128; + } + for (i = 0; i < 2; i++) { + int j; + // 1-D idct + array_transpose_8x8(l + i * 8, in); + array_transpose_8x8(r + i * 8, in + 8); + + IDCT16 + + // 2-D + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 16; ++i) { + RECON_AND_STORE(dest + 0, dc_value); + RECON_AND_STORE(dest + 8, dc_value); + dest += stride; + } +} + +void iadst16_8col(__m128i *in) { + // perform 16x16 1-D ADST for 8 columns + __m128i s[16], x[16], u[32], v[32]; + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kZero = _mm_set1_epi16(0); + + u[0] = _mm_unpacklo_epi16(in[15], in[0]); + u[1] = _mm_unpackhi_epi16(in[15], in[0]); + u[2] = _mm_unpacklo_epi16(in[13], in[2]); + u[3] = _mm_unpackhi_epi16(in[13], in[2]); + u[4] = _mm_unpacklo_epi16(in[11], in[4]); + u[5] = _mm_unpackhi_epi16(in[11], in[4]); + u[6] = _mm_unpacklo_epi16(in[9], in[6]); + u[7] = _mm_unpackhi_epi16(in[9], in[6]); + u[8] = _mm_unpacklo_epi16(in[7], in[8]); + u[9] = _mm_unpackhi_epi16(in[7], in[8]); + u[10] = _mm_unpacklo_epi16(in[5], in[10]); + u[11] = _mm_unpackhi_epi16(in[5], in[10]); + u[12] = _mm_unpacklo_epi16(in[3], in[12]); + u[13] = _mm_unpackhi_epi16(in[3], in[12]); + u[14] = _mm_unpacklo_epi16(in[1], in[14]); + u[15] = _mm_unpackhi_epi16(in[1], in[14]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); + + u[0] = _mm_add_epi32(v[0], v[16]); + u[1] = _mm_add_epi32(v[1], v[17]); + u[2] = _mm_add_epi32(v[2], v[18]); + u[3] = _mm_add_epi32(v[3], v[19]); + u[4] = _mm_add_epi32(v[4], v[20]); + u[5] = _mm_add_epi32(v[5], v[21]); + u[6] = _mm_add_epi32(v[6], v[22]); + u[7] = _mm_add_epi32(v[7], v[23]); + u[8] = _mm_add_epi32(v[8], v[24]); + u[9] = _mm_add_epi32(v[9], v[25]); + u[10] = _mm_add_epi32(v[10], v[26]); + u[11] = _mm_add_epi32(v[11], v[27]); + u[12] = _mm_add_epi32(v[12], v[28]); + u[13] = _mm_add_epi32(v[13], v[29]); + u[14] = _mm_add_epi32(v[14], v[30]); + u[15] = _mm_add_epi32(v[15], v[31]); + u[16] = _mm_sub_epi32(v[0], v[16]); + u[17] = _mm_sub_epi32(v[1], v[17]); + u[18] = _mm_sub_epi32(v[2], v[18]); + u[19] = _mm_sub_epi32(v[3], v[19]); + u[20] = _mm_sub_epi32(v[4], v[20]); + u[21] = _mm_sub_epi32(v[5], v[21]); + u[22] = _mm_sub_epi32(v[6], v[22]); + u[23] = _mm_sub_epi32(v[7], v[23]); + u[24] = _mm_sub_epi32(v[8], v[24]); + u[25] = _mm_sub_epi32(v[9], v[25]); + u[26] = _mm_sub_epi32(v[10], v[26]); + u[27] = _mm_sub_epi32(v[11], v[27]); + u[28] = _mm_sub_epi32(v[12], v[28]); + u[29] = _mm_sub_epi32(v[13], v[29]); + u[30] = _mm_sub_epi32(v[14], v[30]); + u[31] = _mm_sub_epi32(v[15], v[31]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_packs_epi32(u[8], u[9]); + s[5] = _mm_packs_epi32(u[10], u[11]); + s[6] = _mm_packs_epi32(u[12], u[13]); + s[7] = _mm_packs_epi32(u[14], u[15]); + s[8] = _mm_packs_epi32(u[16], u[17]); + s[9] = _mm_packs_epi32(u[18], u[19]); + s[10] = _mm_packs_epi32(u[20], u[21]); + s[11] = _mm_packs_epi32(u[22], u[23]); + s[12] = _mm_packs_epi32(u[24], u[25]); + s[13] = _mm_packs_epi32(u[26], u[27]); + s[14] = _mm_packs_epi32(u[28], u[29]); + s[15] = _mm_packs_epi32(u[30], u[31]); + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); + u[1] = _mm_unpackhi_epi16(s[8], s[9]); + u[2] = _mm_unpacklo_epi16(s[10], s[11]); + u[3] = _mm_unpackhi_epi16(s[10], s[11]); + u[4] = _mm_unpacklo_epi16(s[12], s[13]); + u[5] = _mm_unpackhi_epi16(s[12], s[13]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], v[8]); + u[1] = _mm_add_epi32(v[1], v[9]); + u[2] = _mm_add_epi32(v[2], v[10]); + u[3] = _mm_add_epi32(v[3], v[11]); + u[4] = _mm_add_epi32(v[4], v[12]); + u[5] = _mm_add_epi32(v[5], v[13]); + u[6] = _mm_add_epi32(v[6], v[14]); + u[7] = _mm_add_epi32(v[7], v[15]); + u[8] = _mm_sub_epi32(v[0], v[8]); + u[9] = _mm_sub_epi32(v[1], v[9]); + u[10] = _mm_sub_epi32(v[2], v[10]); + u[11] = _mm_sub_epi32(v[3], v[11]); + u[12] = _mm_sub_epi32(v[4], v[12]); + u[13] = _mm_sub_epi32(v[5], v[13]); + u[14] = _mm_sub_epi32(v[6], v[14]); + u[15] = _mm_sub_epi32(v[7], v[15]); + + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + + x[0] = _mm_add_epi16(s[0], s[4]); + x[1] = _mm_add_epi16(s[1], s[5]); + x[2] = _mm_add_epi16(s[2], s[6]); + x[3] = _mm_add_epi16(s[3], s[7]); + x[4] = _mm_sub_epi16(s[0], s[4]); + x[5] = _mm_sub_epi16(s[1], s[5]); + x[6] = _mm_sub_epi16(s[2], s[6]); + x[7] = _mm_sub_epi16(s[3], s[7]); + x[8] = _mm_packs_epi32(u[0], u[1]); + x[9] = _mm_packs_epi32(u[2], u[3]); + x[10] = _mm_packs_epi32(u[4], u[5]); + x[11] = _mm_packs_epi32(u[6], u[7]); + x[12] = _mm_packs_epi32(u[8], u[9]); + x[13] = _mm_packs_epi32(u[10], u[11]); + x[14] = _mm_packs_epi32(u[12], u[13]); + x[15] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); + u[1] = _mm_unpackhi_epi16(x[4], x[5]); + u[2] = _mm_unpacklo_epi16(x[6], x[7]); + u[3] = _mm_unpackhi_epi16(x[6], x[7]); + u[4] = _mm_unpacklo_epi16(x[12], x[13]); + u[5] = _mm_unpackhi_epi16(x[12], x[13]); + u[6] = _mm_unpacklo_epi16(x[14], x[15]); + u[7] = _mm_unpackhi_epi16(x[14], x[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); + + u[0] = _mm_add_epi32(v[0], v[4]); + u[1] = _mm_add_epi32(v[1], v[5]); + u[2] = _mm_add_epi32(v[2], v[6]); + u[3] = _mm_add_epi32(v[3], v[7]); + u[4] = _mm_sub_epi32(v[0], v[4]); + u[5] = _mm_sub_epi32(v[1], v[5]); + u[6] = _mm_sub_epi32(v[2], v[6]); + u[7] = _mm_sub_epi32(v[3], v[7]); + u[8] = _mm_add_epi32(v[8], v[12]); + u[9] = _mm_add_epi32(v[9], v[13]); + u[10] = _mm_add_epi32(v[10], v[14]); + u[11] = _mm_add_epi32(v[11], v[15]); + u[12] = _mm_sub_epi32(v[8], v[12]); + u[13] = _mm_sub_epi32(v[9], v[13]); + u[14] = _mm_sub_epi32(v[10], v[14]); + u[15] = _mm_sub_epi32(v[11], v[15]); + + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_add_epi16(x[0], x[2]); + s[1] = _mm_add_epi16(x[1], x[3]); + s[2] = _mm_sub_epi16(x[0], x[2]); + s[3] = _mm_sub_epi16(x[1], x[3]); + s[4] = _mm_packs_epi32(v[0], v[1]); + s[5] = _mm_packs_epi32(v[2], v[3]); + s[6] = _mm_packs_epi32(v[4], v[5]); + s[7] = _mm_packs_epi32(v[6], v[7]); + s[8] = _mm_add_epi16(x[8], x[10]); + s[9] = _mm_add_epi16(x[9], x[11]); + s[10] = _mm_sub_epi16(x[8], x[10]); + s[11] = _mm_sub_epi16(x[9], x[11]); + s[12] = _mm_packs_epi32(v[8], v[9]); + s[13] = _mm_packs_epi32(v[10], v[11]); + s[14] = _mm_packs_epi32(v[12], v[13]); + s[15] = _mm_packs_epi32(v[14], v[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + u[4] = _mm_unpacklo_epi16(s[10], s[11]); + u[5] = _mm_unpackhi_epi16(s[10], s[11]); + u[6] = _mm_unpacklo_epi16(s[14], s[15]); + u[7] = _mm_unpackhi_epi16(s[14], s[15]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[8]); + in[2] = s[12]; + in[3] = _mm_sub_epi16(kZero, s[4]); + in[4] = _mm_packs_epi32(v[4], v[5]); + in[5] = _mm_packs_epi32(v[12], v[13]); + in[6] = _mm_packs_epi32(v[8], v[9]); + in[7] = _mm_packs_epi32(v[0], v[1]); + in[8] = _mm_packs_epi32(v[2], v[3]); + in[9] = _mm_packs_epi32(v[10], v[11]); + in[10] = _mm_packs_epi32(v[14], v[15]); + in[11] = _mm_packs_epi32(v[6], v[7]); + in[12] = s[5]; + in[13] = _mm_sub_epi16(kZero, s[13]); + in[14] = s[9]; + in[15] = _mm_sub_epi16(kZero, s[1]); +} + +void idct16_8col(__m128i *in) { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i v[16], u[16], s[16], t[16]; + + // stage 1 + s[0] = in[0]; + s[1] = in[8]; + s[2] = in[4]; + s[3] = in[12]; + s[4] = in[2]; + s[5] = in[10]; + s[6] = in[6]; + s[7] = in[14]; + s[8] = in[1]; + s[9] = in[9]; + s[10] = in[5]; + s[11] = in[13]; + s[12] = in[3]; + s[13] = in[11]; + s[14] = in[7]; + s[15] = in[15]; + + // stage 2 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); + u[1] = _mm_unpackhi_epi16(s[8], s[15]); + u[2] = _mm_unpacklo_epi16(s[9], s[14]); + u[3] = _mm_unpackhi_epi16(s[9], s[14]); + u[4] = _mm_unpacklo_epi16(s[10], s[13]); + u[5] = _mm_unpackhi_epi16(s[10], s[13]); + u[6] = _mm_unpacklo_epi16(s[11], s[12]); + u[7] = _mm_unpackhi_epi16(s[11], s[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[8] = _mm_packs_epi32(u[0], u[1]); + s[15] = _mm_packs_epi32(u[2], u[3]); + s[9] = _mm_packs_epi32(u[4], u[5]); + s[14] = _mm_packs_epi32(u[6], u[7]); + s[10] = _mm_packs_epi32(u[8], u[9]); + s[13] = _mm_packs_epi32(u[10], u[11]); + s[11] = _mm_packs_epi32(u[12], u[13]); + s[12] = _mm_packs_epi32(u[14], u[15]); + + // stage 3 + t[0] = s[0]; + t[1] = s[1]; + t[2] = s[2]; + t[3] = s[3]; + u[0] = _mm_unpacklo_epi16(s[4], s[7]); + u[1] = _mm_unpackhi_epi16(s[4], s[7]); + u[2] = _mm_unpacklo_epi16(s[5], s[6]); + u[3] = _mm_unpackhi_epi16(s[5], s[6]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); + v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + t[4] = _mm_packs_epi32(u[0], u[1]); + t[7] = _mm_packs_epi32(u[2], u[3]); + t[5] = _mm_packs_epi32(u[4], u[5]); + t[6] = _mm_packs_epi32(u[6], u[7]); + t[8] = _mm_add_epi16(s[8], s[9]); + t[9] = _mm_sub_epi16(s[8], s[9]); + t[10] = _mm_sub_epi16(s[11], s[10]); + t[11] = _mm_add_epi16(s[10], s[11]); + t[12] = _mm_add_epi16(s[12], s[13]); + t[13] = _mm_sub_epi16(s[12], s[13]); + t[14] = _mm_sub_epi16(s[15], s[14]); + t[15] = _mm_add_epi16(s[14], s[15]); + + // stage 4 + u[0] = _mm_unpacklo_epi16(t[0], t[1]); + u[1] = _mm_unpackhi_epi16(t[0], t[1]); + u[2] = _mm_unpacklo_epi16(t[2], t[3]); + u[3] = _mm_unpackhi_epi16(t[2], t[3]); + u[4] = _mm_unpacklo_epi16(t[9], t[14]); + u[5] = _mm_unpackhi_epi16(t[9], t[14]); + u[6] = _mm_unpacklo_epi16(t[10], t[13]); + u[7] = _mm_unpackhi_epi16(t[10], t[13]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + + s[0] = _mm_packs_epi32(u[0], u[1]); + s[1] = _mm_packs_epi32(u[2], u[3]); + s[2] = _mm_packs_epi32(u[4], u[5]); + s[3] = _mm_packs_epi32(u[6], u[7]); + s[4] = _mm_add_epi16(t[4], t[5]); + s[5] = _mm_sub_epi16(t[4], t[5]); + s[6] = _mm_sub_epi16(t[7], t[6]); + s[7] = _mm_add_epi16(t[6], t[7]); + s[8] = t[8]; + s[15] = t[15]; + s[9] = _mm_packs_epi32(u[8], u[9]); + s[14] = _mm_packs_epi32(u[10], u[11]); + s[10] = _mm_packs_epi32(u[12], u[13]); + s[13] = _mm_packs_epi32(u[14], u[15]); + s[11] = t[11]; + s[12] = t[12]; + + // stage 5 + t[0] = _mm_add_epi16(s[0], s[3]); + t[1] = _mm_add_epi16(s[1], s[2]); + t[2] = _mm_sub_epi16(s[1], s[2]); + t[3] = _mm_sub_epi16(s[0], s[3]); + t[4] = s[4]; + t[7] = s[7]; + + u[0] = _mm_unpacklo_epi16(s[5], s[6]); + u[1] = _mm_unpackhi_epi16(s[5], s[6]); + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + t[5] = _mm_packs_epi32(u[0], u[1]); + t[6] = _mm_packs_epi32(u[2], u[3]); + + t[8] = _mm_add_epi16(s[8], s[11]); + t[9] = _mm_add_epi16(s[9], s[10]); + t[10] = _mm_sub_epi16(s[9], s[10]); + t[11] = _mm_sub_epi16(s[8], s[11]); + t[12] = _mm_sub_epi16(s[15], s[12]); + t[13] = _mm_sub_epi16(s[14], s[13]); + t[14] = _mm_add_epi16(s[13], s[14]); + t[15] = _mm_add_epi16(s[12], s[15]); + + // stage 6 + s[0] = _mm_add_epi16(t[0], t[7]); + s[1] = _mm_add_epi16(t[1], t[6]); + s[2] = _mm_add_epi16(t[2], t[5]); + s[3] = _mm_add_epi16(t[3], t[4]); + s[4] = _mm_sub_epi16(t[3], t[4]); + s[5] = _mm_sub_epi16(t[2], t[5]); + s[6] = _mm_sub_epi16(t[1], t[6]); + s[7] = _mm_sub_epi16(t[0], t[7]); + s[8] = t[8]; + s[9] = t[9]; + + u[0] = _mm_unpacklo_epi16(t[10], t[13]); + u[1] = _mm_unpackhi_epi16(t[10], t[13]); + u[2] = _mm_unpacklo_epi16(t[11], t[12]); + u[3] = _mm_unpackhi_epi16(t[11], t[12]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); + + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); + + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); + + s[10] = _mm_packs_epi32(u[0], u[1]); + s[13] = _mm_packs_epi32(u[2], u[3]); + s[11] = _mm_packs_epi32(u[4], u[5]); + s[12] = _mm_packs_epi32(u[6], u[7]); + s[14] = t[14]; + s[15] = t[15]; + + // stage 7 + in[0] = _mm_add_epi16(s[0], s[15]); + in[1] = _mm_add_epi16(s[1], s[14]); + in[2] = _mm_add_epi16(s[2], s[13]); + in[3] = _mm_add_epi16(s[3], s[12]); + in[4] = _mm_add_epi16(s[4], s[11]); + in[5] = _mm_add_epi16(s[5], s[10]); + in[6] = _mm_add_epi16(s[6], s[9]); + in[7] = _mm_add_epi16(s[7], s[8]); + in[8] = _mm_sub_epi16(s[7], s[8]); + in[9] = _mm_sub_epi16(s[6], s[9]); + in[10] = _mm_sub_epi16(s[5], s[10]); + in[11] = _mm_sub_epi16(s[4], s[11]); + in[12] = _mm_sub_epi16(s[3], s[12]); + in[13] = _mm_sub_epi16(s[2], s[13]); + in[14] = _mm_sub_epi16(s[1], s[14]); + in[15] = _mm_sub_epi16(s[0], s[15]); +} + +void aom_idct16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + idct16_8col(in0); + idct16_8col(in1); +} + +void aom_iadst16_sse2(__m128i *in0, __m128i *in1) { + array_transpose_16x16(in0, in1); + iadst16_8col(in0); + iadst16_8col(in1); +} + +void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i in[16], l[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, + stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, + stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + // First 1-D inverse DCT + // Load input data. + in[0] = load_input_data(input); + in[1] = load_input_data(input + 8 * 2); + in[2] = load_input_data(input + 8 * 4); + in[3] = load_input_data(input + 8 * 6); + + TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, tmp2); + stp2_11 = _mm_packs_epi32(tmp5, tmp7); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); + + tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); + stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + + stp1_4 = _mm_packs_epi32(tmp0, tmp2); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp1_0 = _mm_packs_epi32(tmp0, tmp0); + stp1_1 = _mm_packs_epi32(tmp2, tmp2); + stp2_9 = _mm_packs_epi32(tmp1, tmp3); + stp2_10 = _mm_packs_epi32(tmp5, tmp7); + + stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); + } + + // Stage5 and Stage6 + { + tmp0 = _mm_add_epi16(stp2_8, stp2_11); + tmp1 = _mm_sub_epi16(stp2_8, stp2_11); + tmp2 = _mm_add_epi16(stp2_9, stp2_10); + tmp3 = _mm_sub_epi16(stp2_9, stp2_10); + + stp1_9 = _mm_unpacklo_epi64(tmp2, zero); + stp1_10 = _mm_unpacklo_epi64(tmp3, zero); + stp1_8 = _mm_unpacklo_epi64(tmp0, zero); + stp1_11 = _mm_unpacklo_epi64(tmp1, zero); + + stp1_13 = _mm_unpackhi_epi64(tmp3, zero); + stp1_14 = _mm_unpackhi_epi64(tmp2, zero); + stp1_12 = _mm_unpackhi_epi64(tmp1, zero); + stp1_15 = _mm_unpackhi_epi64(tmp0, zero); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_6 = _mm_packs_epi32(tmp3, tmp1); + + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + tmp0 = _mm_add_epi16(stp1_0, stp1_4); + tmp1 = _mm_sub_epi16(stp1_0, stp1_4); + tmp2 = _mm_add_epi16(stp1_1, stp1_6); + tmp3 = _mm_sub_epi16(stp1_1, stp1_6); + + stp2_0 = _mm_unpackhi_epi64(tmp0, zero); + stp2_1 = _mm_unpacklo_epi64(tmp2, zero); + stp2_2 = _mm_unpackhi_epi64(tmp2, zero); + stp2_3 = _mm_unpacklo_epi64(tmp0, zero); + stp2_4 = _mm_unpacklo_epi64(tmp1, zero); + stp2_5 = _mm_unpackhi_epi64(tmp3, zero); + stp2_6 = _mm_unpacklo_epi64(tmp3, zero); + stp2_7 = _mm_unpackhi_epi64(tmp1, zero); + } + + // Stage7. Left 8x16 only. + l[0] = _mm_add_epi16(stp2_0, stp1_15); + l[1] = _mm_add_epi16(stp2_1, stp1_14); + l[2] = _mm_add_epi16(stp2_2, stp2_13); + l[3] = _mm_add_epi16(stp2_3, stp2_12); + l[4] = _mm_add_epi16(stp2_4, stp2_11); + l[5] = _mm_add_epi16(stp2_5, stp2_10); + l[6] = _mm_add_epi16(stp2_6, stp1_9); + l[7] = _mm_add_epi16(stp2_7, stp1_8); + l[8] = _mm_sub_epi16(stp2_7, stp1_8); + l[9] = _mm_sub_epi16(stp2_6, stp1_9); + l[10] = _mm_sub_epi16(stp2_5, stp2_10); + l[11] = _mm_sub_epi16(stp2_4, stp2_11); + l[12] = _mm_sub_epi16(stp2_3, stp2_12); + l[13] = _mm_sub_epi16(stp2_2, stp2_13); + l[14] = _mm_sub_epi16(stp2_1, stp1_14); + l[15] = _mm_sub_epi16(stp2_0, stp1_15); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 2; i++) { + int j; + array_transpose_4X8(l + 8 * i, in); + + IDCT16_10 + + // Stage7 + in[0] = _mm_add_epi16(stp2_0, stp1_15); + in[1] = _mm_add_epi16(stp2_1, stp1_14); + in[2] = _mm_add_epi16(stp2_2, stp2_13); + in[3] = _mm_add_epi16(stp2_3, stp2_12); + in[4] = _mm_add_epi16(stp2_4, stp2_11); + in[5] = _mm_add_epi16(stp2_5, stp2_10); + in[6] = _mm_add_epi16(stp2_6, stp1_9); + in[7] = _mm_add_epi16(stp2_7, stp1_8); + in[8] = _mm_sub_epi16(stp2_7, stp1_8); + in[9] = _mm_sub_epi16(stp2_6, stp1_9); + in[10] = _mm_sub_epi16(stp2_5, stp2_10); + in[11] = _mm_sub_epi16(stp2_4, stp2_11); + in[12] = _mm_sub_epi16(stp2_3, stp2_12); + in[13] = _mm_sub_epi16(stp2_2, stp2_13); + in[14] = _mm_sub_epi16(stp2_1, stp1_14); + in[15] = _mm_sub_epi16(stp2_0, stp1_15); + + for (j = 0; j < 16; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +#define LOAD_DQCOEFF(reg, input) \ + { \ + reg = load_input_data(input); \ + input += 8; \ + } + +#define IDCT32_34 \ + /* Stage1 */ \ + { \ + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ + \ + const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ + \ + const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ + stp1_31); \ + MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ + stp1_28); \ + MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ + stp1_27); \ + MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ + stp1_24); \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ + \ + const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ + \ + MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ + stp2_15); \ + MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ + stp2_12); \ + \ + stp2_16 = stp1_16; \ + stp2_19 = stp1_19; \ + \ + stp2_20 = stp1_20; \ + stp2_23 = stp1_23; \ + \ + stp2_24 = stp1_24; \ + stp2_27 = stp1_27; \ + \ + stp2_28 = stp1_28; \ + stp2_31 = stp1_31; \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ + \ + MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ + stp1_7); \ + \ + stp1_8 = stp2_8; \ + stp1_11 = stp2_11; \ + stp1_12 = stp2_12; \ + stp1_15 = stp2_15; \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ + stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ + stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ + \ + MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ + stp2_1); \ + \ + stp2_4 = stp1_4; \ + stp2_5 = stp1_4; \ + stp2_6 = stp1_7; \ + stp2_7 = stp1_7; \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = stp2_0; \ + stp1_1 = stp2_1; \ + stp1_2 = stp2_1; \ + stp1_3 = stp2_0; \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ + stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ + } \ + \ + /* Stage7 */ \ + { \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ + stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } + +#define IDCT32(in0, in1) \ + /* Stage1 */ \ + { \ + const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \ + const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \ + const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \ + const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \ + \ + const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \ + const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \ + const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \ + const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \ + \ + const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \ + const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \ + const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \ + const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \ + \ + const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \ + const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \ + const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \ + const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \ + \ + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ + stp1_30) \ + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ + stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ + stp1_21, stp1_26) \ + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ + stp1_23, stp1_24) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \ + const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \ + const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \ + const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \ + \ + const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \ + const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \ + const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \ + const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \ + \ + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ + stp2_14) \ + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ + \ + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ + \ + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ + } \ + \ + /* Stage3 */ \ + { \ + const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \ + const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \ + const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \ + const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \ + \ + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + \ + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ + stp1_6) \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ + \ + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ + stp1_29) \ + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ + stp1_25) \ + \ + stp1_16 = stp2_16; \ + stp1_31 = stp2_31; \ + stp1_19 = stp2_19; \ + stp1_20 = stp2_20; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_27 = stp2_27; \ + stp1_28 = stp2_28; \ + } \ + \ + /* Stage4 */ \ + { \ + const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \ + const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \ + const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \ + const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \ + \ + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + \ + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ + \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ + \ + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ + stp2_13) \ + \ + stp2_8 = stp1_8; \ + stp2_15 = stp1_15; \ + stp2_11 = stp1_11; \ + stp2_12 = stp1_12; \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ + \ + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ + } \ + \ + /* Stage5 */ \ + { \ + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ + \ + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ + \ + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + \ + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ + \ + stp1_4 = stp2_4; \ + stp1_7 = stp2_7; \ + \ + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + \ + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ + stp1_28) \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + \ + stp1_22 = stp2_22; \ + stp1_23 = stp2_23; \ + stp1_24 = stp2_24; \ + stp1_25 = stp2_25; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } \ + \ + /* Stage6 */ \ + { \ + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ + \ + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ + \ + stp2_8 = stp1_8; \ + stp2_9 = stp1_9; \ + stp2_14 = stp1_14; \ + stp2_15 = stp1_15; \ + \ + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ + stp2_12) \ + \ + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ + \ + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ + } \ + \ + /* Stage7 */ \ + { \ + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ + \ + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ + \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ + \ + stp1_16 = stp2_16; \ + stp1_17 = stp2_17; \ + stp1_18 = stp2_18; \ + stp1_19 = stp2_19; \ + \ + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ + stp1_26) \ + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ + stp1_24) \ + \ + stp1_28 = stp2_28; \ + stp1_29 = stp2_29; \ + stp1_30 = stp2_30; \ + stp1_31 = stp2_31; \ + } + +// Only upper-left 8x8 has non-zero coeff +void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[32]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. Only need to load the top left 8x8 block. + in[0] = load_input_data(input); + in[1] = load_input_data(input + 32); + in[2] = load_input_data(input + 64); + in[3] = load_input_data(input + 96); + in[4] = load_input_data(input + 128); + in[5] = load_input_data(input + 160); + in[6] = load_input_data(input + 192); + in[7] = load_input_data(input + 224); + + for (i = 8; i < 32; ++i) { + in[i] = _mm_setzero_si128(); + } + + array_transpose_8x8(in, in); + // TODO(hkuang): Following transposes are unnecessary. But remove them will + // lead to performance drop on some devices. + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32_34 + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[0] = _mm_add_epi16(stp1_0, stp1_31); + col[1] = _mm_add_epi16(stp1_1, stp1_30); + col[2] = _mm_add_epi16(stp1_2, stp1_29); + col[3] = _mm_add_epi16(stp1_3, stp1_28); + col[4] = _mm_add_epi16(stp1_4, stp1_27); + col[5] = _mm_add_epi16(stp1_5, stp1_26); + col[6] = _mm_add_epi16(stp1_6, stp1_25); + col[7] = _mm_add_epi16(stp1_7, stp1_24); + col[8] = _mm_add_epi16(stp1_8, stp1_23); + col[9] = _mm_add_epi16(stp1_9, stp1_22); + col[10] = _mm_add_epi16(stp1_10, stp1_21); + col[11] = _mm_add_epi16(stp1_11, stp1_20); + col[12] = _mm_add_epi16(stp1_12, stp1_19); + col[13] = _mm_add_epi16(stp1_13, stp1_18); + col[14] = _mm_add_epi16(stp1_14, stp1_17); + col[15] = _mm_add_epi16(stp1_15, stp1_16); + col[16] = _mm_sub_epi16(stp1_15, stp1_16); + col[17] = _mm_sub_epi16(stp1_14, stp1_17); + col[18] = _mm_sub_epi16(stp1_13, stp1_18); + col[19] = _mm_sub_epi16(stp1_12, stp1_19); + col[20] = _mm_sub_epi16(stp1_11, stp1_20); + col[21] = _mm_sub_epi16(stp1_10, stp1_21); + col[22] = _mm_sub_epi16(stp1_9, stp1_22); + col[23] = _mm_sub_epi16(stp1_8, stp1_23); + col[24] = _mm_sub_epi16(stp1_7, stp1_24); + col[25] = _mm_sub_epi16(stp1_6, stp1_25); + col[26] = _mm_sub_epi16(stp1_5, stp1_26); + col[27] = _mm_sub_epi16(stp1_4, stp1_27); + col[28] = _mm_sub_epi16(stp1_3, stp1_28); + col[29] = _mm_sub_epi16(stp1_2, stp1_29); + col[30] = _mm_sub_epi16(stp1_1, stp1_30); + col[31] = _mm_sub_epi16(stp1_0, stp1_31); + for (i = 0; i < 4; i++) { + int j; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + i * 8, in); + IDCT32_34 + + // 2_D: Calculate the results and store them to destination. + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in[32], col[128], zero_idx[16]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j, i32; + + for (i = 0; i < 4; i++) { + i32 = (i << 5); + // First 1-D idct + // Load input data. + LOAD_DQCOEFF(in[0], input); + LOAD_DQCOEFF(in[8], input); + LOAD_DQCOEFF(in[16], input); + LOAD_DQCOEFF(in[24], input); + LOAD_DQCOEFF(in[1], input); + LOAD_DQCOEFF(in[9], input); + LOAD_DQCOEFF(in[17], input); + LOAD_DQCOEFF(in[25], input); + LOAD_DQCOEFF(in[2], input); + LOAD_DQCOEFF(in[10], input); + LOAD_DQCOEFF(in[18], input); + LOAD_DQCOEFF(in[26], input); + LOAD_DQCOEFF(in[3], input); + LOAD_DQCOEFF(in[11], input); + LOAD_DQCOEFF(in[19], input); + LOAD_DQCOEFF(in[27], input); + + LOAD_DQCOEFF(in[4], input); + LOAD_DQCOEFF(in[12], input); + LOAD_DQCOEFF(in[20], input); + LOAD_DQCOEFF(in[28], input); + LOAD_DQCOEFF(in[5], input); + LOAD_DQCOEFF(in[13], input); + LOAD_DQCOEFF(in[21], input); + LOAD_DQCOEFF(in[29], input); + LOAD_DQCOEFF(in[6], input); + LOAD_DQCOEFF(in[14], input); + LOAD_DQCOEFF(in[22], input); + LOAD_DQCOEFF(in[30], input); + LOAD_DQCOEFF(in[7], input); + LOAD_DQCOEFF(in[15], input); + LOAD_DQCOEFF(in[23], input); + LOAD_DQCOEFF(in[31], input); + + // checking if all entries are zero + zero_idx[0] = _mm_or_si128(in[0], in[1]); + zero_idx[1] = _mm_or_si128(in[2], in[3]); + zero_idx[2] = _mm_or_si128(in[4], in[5]); + zero_idx[3] = _mm_or_si128(in[6], in[7]); + zero_idx[4] = _mm_or_si128(in[8], in[9]); + zero_idx[5] = _mm_or_si128(in[10], in[11]); + zero_idx[6] = _mm_or_si128(in[12], in[13]); + zero_idx[7] = _mm_or_si128(in[14], in[15]); + zero_idx[8] = _mm_or_si128(in[16], in[17]); + zero_idx[9] = _mm_or_si128(in[18], in[19]); + zero_idx[10] = _mm_or_si128(in[20], in[21]); + zero_idx[11] = _mm_or_si128(in[22], in[23]); + zero_idx[12] = _mm_or_si128(in[24], in[25]); + zero_idx[13] = _mm_or_si128(in[26], in[27]); + zero_idx[14] = _mm_or_si128(in[28], in[29]); + zero_idx[15] = _mm_or_si128(in[30], in[31]); + + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); + + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); + + if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + IDCT32(in, in + 16) + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } + for (i = 0; i < 4; i++) { + // Second 1-D idct + j = i << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + j, in); + array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + IDCT32(in, in + 16) + + // 2_D: Calculate the results and store them to destination. + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = _mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, j; + + a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + if (a == 0) return; + + dc_value = _mm_set1_epi16(a); + + for (j = 0; j < 32; ++j) { + RECON_AND_STORE(dest + 0 + j * stride, dc_value); + RECON_AND_STORE(dest + 8 + j * stride, dc_value); + RECON_AND_STORE(dest + 16 + j * stride, dc_value); + RECON_AND_STORE(dest + 24 + j * stride, dc_value); + } +} + +// Apply a 32-element IDCT to 8 columns. This does not do any transposition +// of its input - the caller is expected to have done that. +// The input buffers are the top and bottom halves of an 8x32 block. +void idct32_8col(__m128i *in0, __m128i *in1) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + IDCT32(in0, in1) + + // 2_D: Calculate the results and store them to destination. + in0[0] = _mm_add_epi16(stp1_0, stp1_31); + in0[1] = _mm_add_epi16(stp1_1, stp1_30); + in0[2] = _mm_add_epi16(stp1_2, stp1_29); + in0[3] = _mm_add_epi16(stp1_3, stp1_28); + in0[4] = _mm_add_epi16(stp1_4, stp1_27); + in0[5] = _mm_add_epi16(stp1_5, stp1_26); + in0[6] = _mm_add_epi16(stp1_6, stp1_25); + in0[7] = _mm_add_epi16(stp1_7, stp1_24); + in0[8] = _mm_add_epi16(stp1_8, stp1_23); + in0[9] = _mm_add_epi16(stp1_9, stp1_22); + in0[10] = _mm_add_epi16(stp1_10, stp1_21); + in0[11] = _mm_add_epi16(stp1_11, stp1_20); + in0[12] = _mm_add_epi16(stp1_12, stp1_19); + in0[13] = _mm_add_epi16(stp1_13, stp1_18); + in0[14] = _mm_add_epi16(stp1_14, stp1_17); + in0[15] = _mm_add_epi16(stp1_15, stp1_16); + in1[0] = _mm_sub_epi16(stp1_15, stp1_16); + in1[1] = _mm_sub_epi16(stp1_14, stp1_17); + in1[2] = _mm_sub_epi16(stp1_13, stp1_18); + in1[3] = _mm_sub_epi16(stp1_12, stp1_19); + in1[4] = _mm_sub_epi16(stp1_11, stp1_20); + in1[5] = _mm_sub_epi16(stp1_10, stp1_21); + in1[6] = _mm_sub_epi16(stp1_9, stp1_22); + in1[7] = _mm_sub_epi16(stp1_8, stp1_23); + in1[8] = _mm_sub_epi16(stp1_7, stp1_24); + in1[9] = _mm_sub_epi16(stp1_6, stp1_25); + in1[10] = _mm_sub_epi16(stp1_5, stp1_26); + in1[11] = _mm_sub_epi16(stp1_4, stp1_27); + in1[12] = _mm_sub_epi16(stp1_3, stp1_28); + in1[13] = _mm_sub_epi16(stp1_2, stp1_29); + in1[14] = _mm_sub_epi16(stp1_1, stp1_30); + in1[15] = _mm_sub_epi16(stp1_0, stp1_31); +} + +#if CONFIG_HIGHBITDEPTH +static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { + __m128i ubounded, retval; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + ubounded = _mm_cmpgt_epi16(value, max); + retval = _mm_andnot_si128(ubounded, value); + ubounded = _mm_and_si128(ubounded, max); + retval = _mm_or_si128(retval, ubounded); + retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); + return retval; +} + +void aom_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[4 * 4]; + tran_low_t *outptr = out; + int i, j; + __m128i inptr[4]; + __m128i sign_bits[2]; + __m128i temp_mm, min_input, max_input; + int test; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int optimised_cols = 0; + const __m128i zero = _mm_set1_epi16(0); + const __m128i eight = _mm_set1_epi16(8); + const __m128i max = _mm_set1_epi16(12043); + const __m128i min = _mm_set1_epi16(-12043); + // Load input into __m128i + inptr[0] = _mm_loadu_si128((const __m128i *)input); + inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); + inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); + inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); + + // Pack to 16 bits + inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); + inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); + + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (!test) { + // Do the row transform + aom_idct4_sse2(inptr); + + // Check the min & max values + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp_mm = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp_mm); + + if (test) { + array_transpose_4x4(inptr); + sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); + sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); + inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); + inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); + inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); + inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); + _mm_storeu_si128((__m128i *)outptr, inptr[0]); + _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); + _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); + _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + aom_highbd_idct4_c(input, outptr, bd); + input += 4; + outptr += 4; + } + } + + if (optimised_cols) { + aom_idct4_sse2(inptr); + + // Final round and shift + inptr[0] = _mm_add_epi16(inptr[0], eight); + inptr[1] = _mm_add_epi16(inptr[1], eight); + + inptr[0] = _mm_srai_epi16(inptr[0], 4); + inptr[1] = _mm_srai_epi16(inptr[1], 4); + + // Reconstruction and Store + { + __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); + __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi64( + d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); + d2 = _mm_unpacklo_epi64( + d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); + d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); + d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); + // store input0 + _mm_storel_epi64((__m128i *)dest, d0); + // store input1 + d0 = _mm_srli_si128(d0, 8); + _mm_storel_epi64((__m128i *)(dest + stride), d0); + // store input2 + _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); + // store input3 + d2 = _mm_srli_si128(d2, 8); + _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[4], temp_out[4]; + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; + aom_highbd_idct4_c(temp_in, temp_out, bd); + for (j = 0; j < 4; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); + } + } + } +} + +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h new file mode 100644 index 000000000..95d246c3c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_ +#define AOM_DSP_X86_INV_TXFM_SSE2_H_ + +#include // SSE2 +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/inv_txfm.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +// perform 8x8 transpose +static INLINE void array_transpose_4x4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + +static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + + res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); + res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); + res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); + res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); + res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ + } + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + } + +static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { + const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); + + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); +} + +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { + __m128i tbuf[8]; + array_transpose_8x8(res0, res0); + array_transpose_8x8(res1, tbuf); + array_transpose_8x8(res0 + 8, res1); + array_transpose_8x8(res1 + 8, res1 + 8); + + res0[8] = tbuf[0]; + res0[9] = tbuf[1]; + res0[10] = tbuf[2]; + res0[11] = tbuf[3]; + res0[12] = tbuf[4]; + res0[13] = tbuf[5]; + res0[14] = tbuf[6]; + res0[15] = tbuf[7]; +} + +// Function to allow 8 bit optimisations to be used when profile 0 is used with +// highbitdepth enabled +static INLINE __m128i load_input_data(const tran_low_t *data) { +#if CONFIG_HIGHBITDEPTH + return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], + data[6], data[7]); +#else + return _mm_load_si128((const __m128i *)data); +#endif +} + +static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { + in[0] = load_input_data(input + 0 * 16); + in[1] = load_input_data(input + 1 * 16); + in[2] = load_input_data(input + 2 * 16); + in[3] = load_input_data(input + 3 * 16); + in[4] = load_input_data(input + 4 * 16); + in[5] = load_input_data(input + 5 * 16); + in[6] = load_input_data(input + 6 * 16); + in[7] = load_input_data(input + 7 * 16); + + in[8] = load_input_data(input + 8 * 16); + in[9] = load_input_data(input + 9 * 16); + in[10] = load_input_data(input + 10 * 16); + in[11] = load_input_data(input + 11 * 16); + in[12] = load_input_data(input + 12 * 16); + in[13] = load_input_data(input + 13 * 16); + in[14] = load_input_data(input + 14 * 16); + in[15] = load_input_data(input + 15 * 16); +} + +#define RECON_AND_STORE(dest, in_x) \ + { \ + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ + } + +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = _mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + RECON_AND_STORE(dest + 0 * stride, in[0]); + RECON_AND_STORE(dest + 1 * stride, in[1]); + RECON_AND_STORE(dest + 2 * stride, in[2]); + RECON_AND_STORE(dest + 3 * stride, in[3]); + RECON_AND_STORE(dest + 4 * stride, in[4]); + RECON_AND_STORE(dest + 5 * stride, in[5]); + RECON_AND_STORE(dest + 6 * stride, in[6]); + RECON_AND_STORE(dest + 7 * stride, in[7]); + RECON_AND_STORE(dest + 8 * stride, in[8]); + RECON_AND_STORE(dest + 9 * stride, in[9]); + RECON_AND_STORE(dest + 10 * stride, in[10]); + RECON_AND_STORE(dest + 11 * stride, in[11]); + RECON_AND_STORE(dest + 12 * stride, in[12]); + RECON_AND_STORE(dest + 13 * stride, in[13]); + RECON_AND_STORE(dest + 14 * stride, in[14]); + RECON_AND_STORE(dest + 15 * stride, in[15]); +} + +#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ + const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + } + +#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + } + +void iadst16_8col(__m128i *in); +void idct16_8col(__m128i *in); +void aom_idct4_sse2(__m128i *in); +void aom_idct8_sse2(__m128i *in); +void aom_idct16_sse2(__m128i *in0, __m128i *in1); +void aom_iadst4_sse2(__m128i *in); +void aom_iadst8_sse2(__m128i *in); +void aom_iadst16_sse2(__m128i *in0, __m128i *in1); +void idct32_8col(__m128i *in0, __m128i *in1); + +#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c new file mode 100644 index 000000000..9d006797b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c @@ -0,0 +1,1333 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/inv_txfm_sse2.h" +#include "aom_dsp/x86/txfm_common_sse2.h" + +void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. + in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + in4 = load_input_data(input + 8 * 4); + in5 = load_input_data(input + 8 * 5); + in6 = load_input_data(input + 8 * 6); + in7 = load_input_data(input + 8 * 7); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vpx_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + { + /* Stage1 */ + { + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); + + { + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp1 = _mm_madd_epi16(hi_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp3 = _mm_madd_epi16(hi_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp5 = _mm_madd_epi16(hi_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + tmp7 = _mm_madd_epi16(hi_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + tmp4 = _mm_srai_epi32(tmp4, 14); + tmp5 = _mm_srai_epi32(tmp5, 14); + tmp6 = _mm_srai_epi32(tmp6, 14); + tmp7 = _mm_srai_epi32(tmp7, 14); + + stp1_4 = _mm_packs_epi32(tmp0, tmp1); + stp1_7 = _mm_packs_epi32(tmp2, tmp3); + stp1_5 = _mm_packs_epi32(tmp4, tmp5); + stp1_6 = _mm_packs_epi32(tmp6, tmp7); + } + } + + /* Stage2 */ + { + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); + + { + tmp0 = _mm_unpacklo_epi16(in0, in4); + tmp1 = _mm_unpackhi_epi16(in0, in4); + + tmp2 = _mm_madd_epi16(tmp0, stk2_0); + tmp3 = _mm_madd_epi16(tmp1, stk2_0); + tmp4 = _mm_madd_epi16(tmp0, stk2_1); + tmp5 = _mm_madd_epi16(tmp1, stk2_1); + + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp2, tmp3); + stp2_1 = _mm_packs_epi32(tmp4, tmp5); + + tmp0 = _mm_madd_epi16(lo_26, stg2_2); + tmp1 = _mm_madd_epi16(hi_26, stg2_2); + tmp2 = _mm_madd_epi16(lo_26, stg2_3); + tmp3 = _mm_madd_epi16(hi_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + + stp2_2 = _mm_packs_epi32(tmp0, tmp1); + stp2_3 = _mm_packs_epi32(tmp2, tmp3); + } + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + } + + /* Stage3 */ + { + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); + tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); + + tmp2 = _mm_madd_epi16(tmp0, stk2_1); + tmp3 = _mm_madd_epi16(tmp1, stk2_1); + tmp4 = _mm_madd_epi16(tmp0, stk2_0); + tmp5 = _mm_madd_epi16(tmp1, stk2_0); + + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp2, tmp3); + stp1_6 = _mm_packs_epi32(tmp4, tmp5); + } + + /* Stage4 */ + in0 = _mm_add_epi16(stp1_0, stp2_7); + in1 = _mm_add_epi16(stp1_1, stp1_6); + in2 = _mm_add_epi16(stp1_2, stp1_5); + in3 = _mm_add_epi16(stp1_3, stp2_4); + in4 = _mm_sub_epi16(stp1_3, stp2_4); + in5 = _mm_sub_epi16(stp1_2, stp1_5); + in6 = _mm_sub_epi16(stp1_1, stp1_6); + in7 = _mm_sub_epi16(stp1_0, stp2_7); + } + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3; + + // Rows. Load 4-row input data. + in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + + // Stage1 + tmp0 = _mm_mulhrs_epi16(in0, stg1_0); + tmp1 = _mm_mulhrs_epi16(in0, stg1_1); + tmp2 = _mm_mulhrs_epi16(in1, stg1_2); + tmp3 = _mm_mulhrs_epi16(in1, stg1_3); + + stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); + stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); + + // Stage2 + tmp0 = _mm_mulhrs_epi16(in0, stg2_0); + stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); + + tmp1 = _mm_mulhrs_epi16(in1, stg2_2); + tmp2 = _mm_mulhrs_epi16(in1, stg2_3); + stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); + + tmp0 = _mm_add_epi16(stp1_4, stp1_5); + tmp1 = _mm_sub_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + + tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); + tmp1 = _mm_madd_epi16(tmp0, stg3_0); + tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp1, tmp2); + + // Stage3 + tmp2 = _mm_add_epi16(stp2_0, stp2_2); + tmp3 = _mm_sub_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); + stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); + + // Stage4 + tmp0 = _mm_add_epi16(stp1_3, stp2_4); + tmp1 = _mm_add_epi16(stp1_2, stp1_5); + tmp2 = _mm_sub_epi16(stp1_3, stp2_4); + tmp3 = _mm_sub_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + /* Stage1 */ + stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); + stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); + stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); + stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); + + /* Stage2 */ + stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); + stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); + + stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); + stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + /* Stage3 */ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); + tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); + + tmp2 = _mm_madd_epi16(tmp0, stk2_0); + tmp3 = _mm_madd_epi16(tmp1, stk2_0); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + tmp2 = _mm_madd_epi16(tmp0, stk2_1); + tmp3 = _mm_madd_epi16(tmp1, stk2_1); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + stp1_5 = _mm_packs_epi32(tmp2, tmp3); + + /* Stage4 */ + in0 = _mm_add_epi16(stp1_0, stp2_7); + in1 = _mm_add_epi16(stp1_1, stp1_6); + in2 = _mm_add_epi16(stp1_2, stp1_5); + in3 = _mm_add_epi16(stp1_3, stp2_4); + in4 = _mm_sub_epi16(stp1_3, stp2_4); + in5 = _mm_sub_epi16(stp1_2, stp1_5); + in6 = _mm_sub_epi16(stp1_1, stp1_6); + in7 = _mm_sub_epi16(stp1_0, stp2_7); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ + do { \ + tmp0 = _mm_madd_epi16(x0, co0); \ + tmp1 = _mm_madd_epi16(x1, co0); \ + tmp2 = _mm_madd_epi16(x0, co1); \ + tmp3 = _mm_madd_epi16(x1, co1); \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + } while (0) + +static INLINE void butterfly(const __m128i *x0, const __m128i *x1, + const __m128i *c0, const __m128i *c1, __m128i *y0, + __m128i *y1) { + __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm_unpacklo_epi16(*x0, *x1); + u1 = _mm_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *y0 = _mm_packs_epi32(tmp0, tmp1); + *y1 = _mm_packs_epi32(tmp2, tmp3); +} + +static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, + const __m128i *c1) { + __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm_unpacklo_epi16(*x0, *x1); + u1 = _mm_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *x0 = _mm_packs_epi32(tmp0, tmp1); + *x1 = _mm_packs_epi32(tmp2, tmp3); +} + +static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { + const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + + const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i x0, x1, x4, x5, x6, x7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + + // phase 1 + + // 0, 15 + u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15 + u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12 + v15 = _mm_add_epi16(u2, u3); + // in[0], in[4] + x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0] + x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7] + v0 = _mm_add_epi16(x0, x7); // stp2_0 + stp1[0] = _mm_add_epi16(v0, v15); + stp1[15] = _mm_sub_epi16(v0, v15); + + // in[2], in[6] + u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8 + u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11 + butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 + butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 + + v8 = _mm_add_epi16(u0, u1); + v9 = _mm_add_epi16(u4, u6); + v10 = _mm_sub_epi16(u4, u6); + v11 = _mm_sub_epi16(u0, u1); + v12 = _mm_sub_epi16(u2, u3); + v13 = _mm_sub_epi16(u5, u7); + v14 = _mm_add_epi16(u5, u7); + + butterfly_self(&v10, &v13, &stg6_0, &stg4_0); + butterfly_self(&v11, &v12, &stg6_0, &stg4_0); + + // 1, 14 + x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 + // stp1[2] = stp1[0], stp1[3] = stp1[1] + x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4] + butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); + v1 = _mm_add_epi16(x1, x6); // stp2_1 + v2 = _mm_add_epi16(x0, x5); // stp2_2 + stp1[1] = _mm_add_epi16(v1, v14); + stp1[14] = _mm_sub_epi16(v1, v14); + + stp1[2] = _mm_add_epi16(v2, v13); + stp1[13] = _mm_sub_epi16(v2, v13); + + v3 = _mm_add_epi16(x1, x4); // stp2_3 + v4 = _mm_sub_epi16(x1, x4); // stp2_4 + + v5 = _mm_sub_epi16(x0, x5); // stp2_5 + + v6 = _mm_sub_epi16(x1, x6); // stp2_6 + v7 = _mm_sub_epi16(x0, x7); // stp2_7 + stp1[3] = _mm_add_epi16(v3, v12); + stp1[12] = _mm_sub_epi16(v3, v12); + + stp1[6] = _mm_add_epi16(v6, v9); + stp1[9] = _mm_sub_epi16(v6, v9); + + stp1[7] = _mm_add_epi16(v7, v8); + stp1[8] = _mm_sub_epi16(v7, v8); + + stp1[4] = _mm_add_epi16(v4, v11); + stp1[11] = _mm_sub_epi16(v4, v11); + + stp1[5] = _mm_add_epi16(v5, v10); + stp1[10] = _mm_sub_epi16(v5, v10); +} + +static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { + const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i v16, v17, v18, v19, v20, v21, v22, v23; + __m128i v24, v25, v26, v27, v28, v29, v30, v31; + __m128i u16, u17, u18, u19, u20, u21, u22, u23; + __m128i u24, u25, u26, u27, u28, u29, u30, u31; + + v16 = _mm_mulhrs_epi16(in[1], stk1_0); + v31 = _mm_mulhrs_epi16(in[1], stk1_1); + + v19 = _mm_mulhrs_epi16(in[7], stk1_6); + v28 = _mm_mulhrs_epi16(in[7], stk1_7); + + v20 = _mm_mulhrs_epi16(in[5], stk1_8); + v27 = _mm_mulhrs_epi16(in[5], stk1_9); + + v23 = _mm_mulhrs_epi16(in[3], stk1_14); + v24 = _mm_mulhrs_epi16(in[3], stk1_15); + + butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30); + butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); + butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); + butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); + + u16 = _mm_add_epi16(v16, v19); + u17 = _mm_add_epi16(v17, v18); + u18 = _mm_sub_epi16(v17, v18); + u19 = _mm_sub_epi16(v16, v19); + u20 = _mm_sub_epi16(v23, v20); + u21 = _mm_sub_epi16(v22, v21); + u22 = _mm_add_epi16(v22, v21); + u23 = _mm_add_epi16(v23, v20); + u24 = _mm_add_epi16(v24, v27); + u27 = _mm_sub_epi16(v24, v27); + u25 = _mm_add_epi16(v25, v26); + u26 = _mm_sub_epi16(v25, v26); + u28 = _mm_sub_epi16(v31, v28); + u31 = _mm_add_epi16(v28, v31); + u29 = _mm_sub_epi16(v30, v29); + u30 = _mm_add_epi16(v29, v30); + + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + + stp1[16] = _mm_add_epi16(u16, u23); + stp1[23] = _mm_sub_epi16(u16, u23); + + stp1[17] = _mm_add_epi16(u17, u22); + stp1[22] = _mm_sub_epi16(u17, u22); + + stp1[18] = _mm_add_epi16(u18, u21); + stp1[21] = _mm_sub_epi16(u18, u21); + + stp1[19] = _mm_add_epi16(u19, u20); + stp1[20] = _mm_sub_epi16(u19, u20); + + stp1[24] = _mm_sub_epi16(u31, u24); + stp1[31] = _mm_add_epi16(u24, u31); + + stp1[25] = _mm_sub_epi16(u30, u25); + stp1[30] = _mm_add_epi16(u25, u30); + + stp1[26] = _mm_sub_epi16(u29, u26); + stp1[29] = _mm_add_epi16(u26, u29); + + stp1[27] = _mm_sub_epi16(u28, u27); + stp1[28] = _mm_add_epi16(u27, u28); + + butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); + butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); + butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); + butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0); +} + +// Only upper-left 8x8 has non-zero coeff +void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i in[32], col[32]; + __m128i stp1[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. + in[0] = load_input_data(input); + in[1] = load_input_data(input + 32); + in[2] = load_input_data(input + 64); + in[3] = load_input_data(input + 96); + in[4] = load_input_data(input + 128); + in[5] = load_input_data(input + 160); + in[6] = load_input_data(input + 192); + in[7] = load_input_data(input + 224); + + array_transpose_8x8(in, in); + idct32_34_first_half(in, stp1); + idct32_34_second_half(in, stp1); + + // 1_D: Store 32 intermediate results for each 8x32 block. + add_sub_butterfly(stp1, col, 32); + for (i = 0; i < 4; i++) { + int j; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + i * 8, in); + idct32_34_first_half(in, stp1); + idct32_34_second_half(in, stp1); + + // 2_D: Calculate the results and store them to destination. + add_sub_butterfly(stp1, in, 32); + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +// in0[16] represents the left 8x16 block +// in1[16] represents the right 8x16 block +static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, + __m128i *in1) { + int i; + for (i = 0; i < 16; i++) { + in0[i] = load_input_data(input); + in1[i] = load_input_data(input + 8); + input += 32; + } +} + +static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, + __m128i *out1) { + array_transpose_8x8(in0, out0); + array_transpose_8x8(&in0[8], out1); + array_transpose_8x8(in1, &out0[8]); + array_transpose_8x8(&in1[8], &out1[8]); +} + +// Group the coefficient calculation into smaller functions +// to prevent stack spillover: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 +static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/, + __m128i *out /*out[8]*/) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + + { + const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + u0 = _mm_mulhrs_epi16(in[0], stk4_0); + u2 = _mm_mulhrs_epi16(in[8], stk4_2); + u3 = _mm_mulhrs_epi16(in[8], stk4_3); + u1 = u0; + } + + v0 = _mm_add_epi16(u0, u3); + v1 = _mm_add_epi16(u1, u2); + v2 = _mm_sub_epi16(u1, u2); + v3 = _mm_sub_epi16(u0, u3); + + { + const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + u4 = _mm_mulhrs_epi16(in[4], stk3_0); + u7 = _mm_mulhrs_epi16(in[4], stk3_1); + u5 = _mm_mulhrs_epi16(in[12], stk3_2); + u6 = _mm_mulhrs_epi16(in[12], stk3_3); + } + + v4 = _mm_add_epi16(u4, u5); + v5 = _mm_sub_epi16(u4, u5); + v6 = _mm_sub_epi16(u7, u6); + v7 = _mm_add_epi16(u7, u6); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); + } + + out[0] = _mm_add_epi16(v0, v7); + out[1] = _mm_add_epi16(v1, v6); + out[2] = _mm_add_epi16(v2, v5); + out[3] = _mm_add_epi16(v3, v4); + out[4] = _mm_sub_epi16(v3, v4); + out[5] = _mm_sub_epi16(v2, v5); + out[6] = _mm_sub_epi16(v1, v6); + out[7] = _mm_sub_epi16(v0, v7); +} + +static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, + __m128i *out /*out[8]*/) { + __m128i u8, u9, u10, u11, u12, u13, u14, u15; + __m128i v8, v9, v10, v11, v12, v13, v14, v15; + + { + const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); + const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); + const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); + const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); + const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + u8 = _mm_mulhrs_epi16(in[2], stk2_0); + u15 = _mm_mulhrs_epi16(in[2], stk2_1); + u9 = _mm_mulhrs_epi16(in[14], stk2_2); + u14 = _mm_mulhrs_epi16(in[14], stk2_3); + u10 = _mm_mulhrs_epi16(in[10], stk2_4); + u13 = _mm_mulhrs_epi16(in[10], stk2_5); + u11 = _mm_mulhrs_epi16(in[6], stk2_6); + u12 = _mm_mulhrs_epi16(in[6], stk2_7); + } + + v8 = _mm_add_epi16(u8, u9); + v9 = _mm_sub_epi16(u8, u9); + v10 = _mm_sub_epi16(u11, u10); + v11 = _mm_add_epi16(u11, u10); + v12 = _mm_add_epi16(u12, u13); + v13 = _mm_sub_epi16(u12, u13); + v14 = _mm_sub_epi16(u15, u14); + v15 = _mm_add_epi16(u15, u14); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(v8, v11); + out[1] = _mm_add_epi16(v9, v10); + out[2] = _mm_sub_epi16(v9, v10); + out[3] = _mm_sub_epi16(v8, v11); + out[4] = _mm_sub_epi16(v15, v12); + out[5] = _mm_sub_epi16(v14, v13); + out[6] = _mm_add_epi16(v14, v13); + out[7] = _mm_add_epi16(v15, v12); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); + } +} + +// 8x32 block even indexed 8 inputs of in[16], +// output first half 16 to out[32] +static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/, + __m128i *out /*out[32]*/) { + __m128i temp[16]; + idct32_8x32_135_quarter_1(in, temp); + idct32_8x32_135_quarter_2(in, &temp[8]); + add_sub_butterfly(temp, out, 16); +} + +// 8x32 block odd indexed 8 inputs of in[16], +// output second half 16 to out[32] +static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, + __m128i *out /*out[32]*/) { + __m128i v16, v17, v18, v19, v20, v21, v22, v23; + __m128i v24, v25, v26, v27, v28, v29, v30, v31; + __m128i u16, u17, u18, u19, u20, u21, u22, u23; + __m128i u24, u25, u26, u27, u28, u29, u30, u31; + + { + const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); + const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); + + const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); + const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); + const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); + const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); + + const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); + const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); + const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + u16 = _mm_mulhrs_epi16(in[1], stk1_0); + u31 = _mm_mulhrs_epi16(in[1], stk1_1); + u17 = _mm_mulhrs_epi16(in[15], stk1_2); + u30 = _mm_mulhrs_epi16(in[15], stk1_3); + + u18 = _mm_mulhrs_epi16(in[9], stk1_4); + u29 = _mm_mulhrs_epi16(in[9], stk1_5); + u19 = _mm_mulhrs_epi16(in[7], stk1_6); + u28 = _mm_mulhrs_epi16(in[7], stk1_7); + + u20 = _mm_mulhrs_epi16(in[5], stk1_8); + u27 = _mm_mulhrs_epi16(in[5], stk1_9); + u21 = _mm_mulhrs_epi16(in[11], stk1_10); + u26 = _mm_mulhrs_epi16(in[11], stk1_11); + + u22 = _mm_mulhrs_epi16(in[13], stk1_12); + u25 = _mm_mulhrs_epi16(in[13], stk1_13); + u23 = _mm_mulhrs_epi16(in[3], stk1_14); + u24 = _mm_mulhrs_epi16(in[3], stk1_15); + } + + v16 = _mm_add_epi16(u16, u17); + v17 = _mm_sub_epi16(u16, u17); + v18 = _mm_sub_epi16(u19, u18); + v19 = _mm_add_epi16(u19, u18); + + v20 = _mm_add_epi16(u20, u21); + v21 = _mm_sub_epi16(u20, u21); + v22 = _mm_sub_epi16(u23, u22); + v23 = _mm_add_epi16(u23, u22); + + v24 = _mm_add_epi16(u24, u25); + v25 = _mm_sub_epi16(u24, u25); + v26 = _mm_sub_epi16(u27, u26); + v27 = _mm_add_epi16(u27, u26); + + v28 = _mm_add_epi16(u28, u29); + v29 = _mm_sub_epi16(u28, u29); + v30 = _mm_sub_epi16(u31, u30); + v31 = _mm_add_epi16(u31, u30); + + { + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); + } + + u16 = _mm_add_epi16(v16, v19); + u17 = _mm_add_epi16(v17, v18); + u18 = _mm_sub_epi16(v17, v18); + u19 = _mm_sub_epi16(v16, v19); + u20 = _mm_sub_epi16(v23, v20); + u21 = _mm_sub_epi16(v22, v21); + u22 = _mm_add_epi16(v22, v21); + u23 = _mm_add_epi16(v23, v20); + + u24 = _mm_add_epi16(v24, v27); + u25 = _mm_add_epi16(v25, v26); + u26 = _mm_sub_epi16(v25, v26); + u27 = _mm_sub_epi16(v24, v27); + u28 = _mm_sub_epi16(v31, v28); + u29 = _mm_sub_epi16(v30, v29); + u30 = _mm_add_epi16(v29, v30); + u31 = _mm_add_epi16(v28, v31); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(u16, u23); + out[1] = _mm_add_epi16(u17, u22); + out[2] = _mm_add_epi16(u18, u21); + out[3] = _mm_add_epi16(u19, u20); + v20 = _mm_sub_epi16(u19, u20); + v21 = _mm_sub_epi16(u18, u21); + v22 = _mm_sub_epi16(u17, u22); + v23 = _mm_sub_epi16(u16, u23); + + v24 = _mm_sub_epi16(u31, u24); + v25 = _mm_sub_epi16(u30, u25); + v26 = _mm_sub_epi16(u29, u26); + v27 = _mm_sub_epi16(u28, u27); + out[12] = _mm_add_epi16(u27, u28); + out[13] = _mm_add_epi16(u26, u29); + out[14] = _mm_add_epi16(u25, u30); + out[15] = _mm_add_epi16(u24, u31); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); + butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]); + butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); + butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); + } +} + +// 8x16 block, input __m128i in[16], output __m128i in[32] +static void idct32_8x32_135(__m128i *in /*in[32]*/) { + __m128i out[32]; + idct32_8x32_quarter_1_2(in, out); + idct32_8x32_quarter_3_4(in, &out[16]); + add_sub_butterfly(out, in, 32); +} + +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + RECON_AND_STORE(dst, in[j]); + dst += stride; + RECON_AND_STORE(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, + int stride) { + store_buffer_8x32(in0, dest, stride); + store_buffer_8x32(in1, dest + 8, stride); +} + +static INLINE void idct32_135(__m128i *col0, __m128i *col1) { + idct32_8x32_135(col0); + idct32_8x32_135(col1); +} + +typedef enum { left_16, right_16 } ColsIndicator; + +static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store, + ColsIndicator cols) { + switch (cols) { + case left_16: { + int i; + array_transpose_16x16(in0, in1); + for (i = 0; i < 16; ++i) { + store[i] = in0[16 + i]; + store[16 + i] = in1[16 + i]; + } + break; + } + case right_16: { + array_transpose_16x16_2(store, &store[16], in0, in1); + break; + } + default: { assert(0); } + } +} + +// Only upper-left 16x16 has non-zero coeff +void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + // Each array represents an 8x32 block + __m128i col0[32], col1[32]; + // This array represents a 16x16 block + __m128i temp[32]; + + // Load input data. Only need to load the top left 16x16 block. + load_buffer_16x16(input, col0, col1); + + // columns + array_transpose_16x16(col0, col1); + idct32_135(col0, col1); + + // rows + transpose_and_copy_16x16(col0, col1, temp, left_16); + idct32_135(col0, col1); + recon_and_store(col0, col1, dest, stride); + + transpose_and_copy_16x16(col0, col1, temp, right_16); + idct32_135(col0, col1); + recon_and_store(col0, col1, dest + 16, stride); +} + +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i in[32] +static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ + __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ + + { + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); + butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); + } + + v8 = _mm_add_epi16(u8, u9); + v9 = _mm_sub_epi16(u8, u9); + v14 = _mm_sub_epi16(u15, u14); + v15 = _mm_add_epi16(u15, u14); + + { + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); + butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); + } + + v10 = _mm_sub_epi16(u11, u10); + v11 = _mm_add_epi16(u11, u10); + v12 = _mm_add_epi16(u12, u13); + v13 = _mm_sub_epi16(u12, u13); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(v8, v11); + out[1] = _mm_add_epi16(v9, v10); + out[6] = _mm_add_epi16(v14, v13); + out[7] = _mm_add_epi16(v15, v12); + + out[2] = _mm_sub_epi16(v9, v10); + out[3] = _mm_sub_epi16(v8, v11); + out[4] = _mm_sub_epi16(v15, v12); + out[5] = _mm_sub_epi16(v14, v13); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); + } +} + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i in[32] +static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, + __m128i *out /*out[8]*/) { + __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ + __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ + + { + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); + butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); + } + + v4 = _mm_add_epi16(u4, u5); + v5 = _mm_sub_epi16(u4, u5); + v6 = _mm_sub_epi16(u7, u6); + v7 = _mm_add_epi16(u7, u6); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); + + butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); + butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); + } + + v0 = _mm_add_epi16(u0, u3); + v1 = _mm_add_epi16(u1, u2); + v2 = _mm_sub_epi16(u1, u2); + v3 = _mm_sub_epi16(u0, u3); + + out[0] = _mm_add_epi16(v0, v7); + out[1] = _mm_add_epi16(v1, v6); + out[2] = _mm_add_epi16(v2, v5); + out[3] = _mm_add_epi16(v3, v4); + out[4] = _mm_sub_epi16(v3, v4); + out[5] = _mm_sub_epi16(v2, v5); + out[6] = _mm_sub_epi16(v1, v6); + out[7] = _mm_sub_epi16(v0, v7); +} + +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i in[32] +// We avoid hide an offset, 16, inside this function. So we output 0-15 into +// array out[16] +static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i v16, v17, v18, v19, v20, v21, v22, v23; + __m128i v24, v25, v26, v27, v28, v29, v30, v31; + __m128i u16, u17, u18, u19, u20, u21, u22, u23; + __m128i u24, u25, u26, u27, u28, u29, u30, u31; + + { + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); + butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); + butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); + butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); + + butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); + butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); + + butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); + butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); + } + + v16 = _mm_add_epi16(u16, u17); + v17 = _mm_sub_epi16(u16, u17); + v18 = _mm_sub_epi16(u19, u18); + v19 = _mm_add_epi16(u19, u18); + + v20 = _mm_add_epi16(u20, u21); + v21 = _mm_sub_epi16(u20, u21); + v22 = _mm_sub_epi16(u23, u22); + v23 = _mm_add_epi16(u23, u22); + + v24 = _mm_add_epi16(u24, u25); + v25 = _mm_sub_epi16(u24, u25); + v26 = _mm_sub_epi16(u27, u26); + v27 = _mm_add_epi16(u27, u26); + + v28 = _mm_add_epi16(u28, u29); + v29 = _mm_sub_epi16(u28, u29); + v30 = _mm_sub_epi16(u31, u30); + v31 = _mm_add_epi16(u31, u30); + + { + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); + } + + u16 = _mm_add_epi16(v16, v19); + u17 = _mm_add_epi16(v17, v18); + u18 = _mm_sub_epi16(v17, v18); + u19 = _mm_sub_epi16(v16, v19); + u20 = _mm_sub_epi16(v23, v20); + u21 = _mm_sub_epi16(v22, v21); + u22 = _mm_add_epi16(v22, v21); + u23 = _mm_add_epi16(v23, v20); + + u24 = _mm_add_epi16(v24, v27); + u25 = _mm_add_epi16(v25, v26); + u26 = _mm_sub_epi16(v25, v26); + u27 = _mm_sub_epi16(v24, v27); + + u28 = _mm_sub_epi16(v31, v28); + u29 = _mm_sub_epi16(v30, v29); + u30 = _mm_add_epi16(v29, v30); + u31 = _mm_add_epi16(v28, v31); + + { + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + } + + out[0] = _mm_add_epi16(u16, u23); + out[1] = _mm_add_epi16(u17, u22); + out[2] = _mm_add_epi16(u18, u21); + out[3] = _mm_add_epi16(u19, u20); + out[4] = _mm_sub_epi16(u19, u20); + out[5] = _mm_sub_epi16(u18, u21); + out[6] = _mm_sub_epi16(u17, u22); + out[7] = _mm_sub_epi16(u16, u23); + + out[8] = _mm_sub_epi16(u31, u24); + out[9] = _mm_sub_epi16(u30, u25); + out[10] = _mm_sub_epi16(u29, u26); + out[11] = _mm_sub_epi16(u28, u27); + out[12] = _mm_add_epi16(u27, u28); + out[13] = _mm_add_epi16(u26, u29); + out[14] = _mm_add_epi16(u25, u30); + out[15] = _mm_add_epi16(u24, u31); + + { + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); + butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); + butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); + butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); + } +} + +static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[32]*/) { + __m128i temp[16]; + idct32_full_8x32_quarter_1(in, temp); + idct32_full_8x32_quarter_2(in, &temp[8]); + add_sub_butterfly(temp, out, 16); +} + +static void idct32_full_8x32(const __m128i *in /*in[32]*/, + __m128i *out /*out[32]*/) { + __m128i temp[32]; + idct32_full_8x32_quarter_1_2(in, temp); + idct32_full_8x32_quarter_3_4(in, &temp[16]); + add_sub_butterfly(temp, out, 32); +} + +static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { + int i; + for (i = 0; i < 8; ++i) { + in[i] = load_input_data(input); + in[i + 8] = load_input_data(input + 8); + in[i + 16] = load_input_data(input + 16); + in[i + 24] = load_input_data(input + 24); + input += 32; + } +} + +void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[128], in[32]; + int i, j; + + // rows + for (i = 0; i < 4; ++i) { + load_buffer_8x32(input, in); + input += 32 << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + idct32_full_8x32(in, col + (i << 5)); + } + + // columns + for (i = 0; i < 4; ++i) { + j = i << 3; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + j, in); + array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + idct32_full_8x32(in, in); + store_buffer_8x32(in, dest, stride); + dest += 8; + } +} diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm new file mode 100644 index 000000000..f0668e6f3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm @@ -0,0 +1,112 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro REORDER_INPUTS 0 + ; a c d b to a b c d + SWAP 1, 3, 2 +%endmacro + +%macro TRANSFORM_COLS 0 + ; input: + ; m0 a + ; m1 b + ; m2 c + ; m3 d + paddw m0, m2 + psubw m3, m1 + + ; wide subtract + punpcklwd m4, m0 + punpcklwd m5, m3 + psrad m4, 16 + psrad m5, 16 + psubd m4, m5 + psrad m4, 1 + packssdw m4, m4 ; e + + psubw m5, m4, m1 ; b + psubw m4, m2 ; c + psubw m0, m5 + paddw m3, m4 + ; m0 a + SWAP 1, 5 ; m1 b + SWAP 2, 4 ; m2 c + ; m3 d +%endmacro + +%macro TRANSPOSE_4X4 0 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpcklwd m0, m1 + punpckhwd m2, m1 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3 +%macro TRANSPOSE_4X4_WIDE 0 + mova m3, m0 + punpcklwd m0, m1 + punpckhwd m3, m1 + mova m2, m0 + punpcklwd m0, m3 + punpckhwd m2, m3 + pshufd m1, m0, 0x0e + pshufd m3, m2, 0x0e +%endmacro + +%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero + movd m%3, [outputq] + movd m%4, [outputq + strideq] + punpcklbw m%3, m%5 + punpcklbw m%4, m%5 + paddw m%1, m%3 + paddw m%2, m%4 + packuswb m%1, m%5 + packuswb m%2, m%5 + movd [outputq], m%1 + movd [outputq + strideq], m%2 +%endmacro + +INIT_XMM sse2 +cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride +%if CONFIG_HIGHBITDEPTH + mova m0, [inputq + 0] + packssdw m0, [inputq + 16] + mova m1, [inputq + 32] + packssdw m1, [inputq + 48] +%else + mova m0, [inputq + 0] + mova m1, [inputq + 16] +%endif + psraw m0, 2 + psraw m1, 2 + + TRANSPOSE_4X4_WIDE + REORDER_INPUTS + TRANSFORM_COLS + TRANSPOSE_4X4 + REORDER_INPUTS + TRANSFORM_COLS + + pxor m4, m4 + ADD_STORE_4P_2X 0, 1, 5, 6, 4 + lea outputq, [outputq + 2 * strideq] + ADD_STORE_4P_2X 2, 3, 5, 6, 4 + + RET diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c new file mode 100644 index 000000000..bf8150e2a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c @@ -0,0 +1,915 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include /* AVX2 */ + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (aom_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + /* Filter1 >> 3 */ + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), + _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), + _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { + 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, + 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 +}; + +void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, + p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; + + const __m128i thresh = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); + const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); + const __m128i blimit = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + + p256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); + p256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); + p256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); + p256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); + p256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); + q256_0 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); + q256_1 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); + q256_2 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); + q256_3 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); + q256_4 = + _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + + p4 = _mm256_castsi256_si128(p256_4); + p3 = _mm256_castsi256_si128(p256_3); + p2 = _mm256_castsi256_si128(p256_2); + p1 = _mm256_castsi256_si128(p256_1); + p0 = _mm256_castsi256_si128(p256_0); + q0 = _mm256_castsi256_si128(q256_0); + q1 = _mm256_castsi256_si128(q256_1); + q2 = _mm256_castsi256_si128(q256_2); + q3 = _mm256_castsi256_si128(q256_3); + q4 = _mm256_castsi256_si128(q256_4); + + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, + flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, + flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (aom_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + q256_5 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + p5 = _mm256_castsi256_si128(p256_5); + q5 = _mm256_castsi256_si128(q256_5); + flat2 = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); + + flat2 = _mm_max_epu8(work, flat2); + p256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + q256_6 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + p6 = _mm256_castsi256_si128(p256_6); + q6 = _mm256_castsi256_si128(q256_6); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); + + flat2 = _mm_max_epu8(work, flat2); + + p256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + q256_7 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + p7 = _mm256_castsi256_si128(p256_7); + q7 = _mm256_castsi256_si128(q256_7); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); + + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, + pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + const __m256i filter = + _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); + p256_7 = _mm256_shuffle_epi8(p256_7, filter); + p256_6 = _mm256_shuffle_epi8(p256_6, filter); + p256_5 = _mm256_shuffle_epi8(p256_5, filter); + p256_4 = _mm256_shuffle_epi8(p256_4, filter); + p256_3 = _mm256_shuffle_epi8(p256_3, filter); + p256_2 = _mm256_shuffle_epi8(p256_2, filter); + p256_1 = _mm256_shuffle_epi8(p256_1, filter); + p256_0 = _mm256_shuffle_epi8(p256_0, filter); + q256_0 = _mm256_shuffle_epi8(q256_0, filter); + q256_1 = _mm256_shuffle_epi8(q256_1, filter); + q256_2 = _mm256_shuffle_epi8(q256_2, filter); + q256_3 = _mm256_shuffle_epi8(q256_3, filter); + q256_4 = _mm256_shuffle_epi8(q256_4, filter); + q256_5 = _mm256_shuffle_epi8(q256_5, filter); + q256_6 = _mm256_shuffle_epi8(q256_6, filter); + q256_7 = _mm256_shuffle_epi8(q256_7, filter); + + pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), + _mm256_add_epi16(p256_4, p256_3)); + pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), + _mm256_add_epi16(q256_4, q256_3)); + + pixetFilter_p2p1p0 = + _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); + pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = + _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); + pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + + pixelFilter_p = _mm256_add_epi16( + eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); + + pixetFilter_p2p1p0 = _mm256_add_epi16( + four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); + + flat2_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); + + flat2_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(p256_3, p256_0)), + 3); + + flat_p0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(q256_3, q256_0)), + 3); + + flat_q0 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(p256_7, p256_7); + + sum_q7 = _mm256_add_epi16(q256_7, q256_7); + + sum_p3 = _mm256_add_epi16(p256_3, p256_3); + + sum_q3 = _mm256_add_epi16(q256_3, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); + + flat2_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); + + flat2_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_1)), + 3); + + flat_p1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_1)), + 3); + + flat_q1 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + sum_p3 = _mm256_add_epi16(sum_p3, p256_3); + + sum_q3 = _mm256_add_epi16(sum_q3, q256_3); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); + + flat2_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); + + flat2_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); + + pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); + + res_p = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, + _mm256_add_epi16(sum_p3, p256_2)), + 3); + + flat_p2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = + _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, + _mm256_add_epi16(sum_q3, q256_2)), + 3); + + flat_q2 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); + + flat2_p3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); + + flat2_q3 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); + + flat2_p4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); + + flat2_q4 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); + + flat2_p5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); + + flat2_q5 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + + sum_p7 = _mm256_add_epi16(sum_p7, p256_7); + + sum_q7 = _mm256_add_epi16(sum_q7, q256_7); + + pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); + + pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); + + res_p = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); + + flat2_p6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); + + res_q = _mm256_srli_epi16( + _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); + + flat2_q6 = _mm256_castsi256_si128( + _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + p2 = _mm_andnot_si128(flat, p2); + flat_p2 = _mm_and_si128(flat, flat_p2); + p2 = _mm_or_si128(flat_p2, p2); + + p1 = _mm_andnot_si128(flat, ps1); + flat_p1 = _mm_and_si128(flat, flat_p1); + p1 = _mm_or_si128(flat_p1, p1); + + p0 = _mm_andnot_si128(flat, ps0); + flat_p0 = _mm_and_si128(flat, flat_p0); + p0 = _mm_or_si128(flat_p0, p0); + + q0 = _mm_andnot_si128(flat, qs0); + flat_q0 = _mm_and_si128(flat, flat_q0); + q0 = _mm_or_si128(flat_q0, q0); + + q1 = _mm_andnot_si128(flat, qs1); + flat_q1 = _mm_and_si128(flat, flat_q1); + q1 = _mm_or_si128(flat_q1, q1); + + q2 = _mm_andnot_si128(flat, q2); + flat_q2 = _mm_and_si128(flat, flat_q2); + q2 = _mm_or_si128(flat_q2, q2); + + p6 = _mm_andnot_si128(flat2, p6); + flat2_p6 = _mm_and_si128(flat2, flat2_p6); + p6 = _mm_or_si128(flat2_p6, p6); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + p5 = _mm_andnot_si128(flat2, p5); + flat2_p5 = _mm_and_si128(flat2, flat2_p5); + p5 = _mm_or_si128(flat2_p5, p5); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + p4 = _mm_andnot_si128(flat2, p4); + flat2_p4 = _mm_and_si128(flat2, flat2_p4); + p4 = _mm_or_si128(flat2_p4, p4); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + p3 = _mm_andnot_si128(flat2, p3); + flat2_p3 = _mm_and_si128(flat2, flat2_p3); + p3 = _mm_or_si128(flat2_p3, p3); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + p2 = _mm_andnot_si128(flat2, p2); + flat2_p2 = _mm_and_si128(flat2, flat2_p2); + p2 = _mm_or_si128(flat2_p2, p2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + + p1 = _mm_andnot_si128(flat2, p1); + flat2_p1 = _mm_and_si128(flat2, flat2_p1); + p1 = _mm_or_si128(flat2_p1, p1); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + + p0 = _mm_andnot_si128(flat2, p0); + flat2_p0 = _mm_and_si128(flat2, flat2_p0); + p0 = _mm_or_si128(flat2_p0, p0); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + + q0 = _mm_andnot_si128(flat2, q0); + flat2_q0 = _mm_and_si128(flat2, flat2_q0); + q0 = _mm_or_si128(flat2_q0, q0); + _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + + q1 = _mm_andnot_si128(flat2, q1); + flat2_q1 = _mm_and_si128(flat2, flat2_q1); + q1 = _mm_or_si128(flat2_q1, q1); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + + q2 = _mm_andnot_si128(flat2, q2); + flat2_q2 = _mm_and_si128(flat2, flat2_q2); + q2 = _mm_or_si128(flat2_q2, q2); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + + q3 = _mm_andnot_si128(flat2, q3); + flat2_q3 = _mm_and_si128(flat2, flat2_q3); + q3 = _mm_or_si128(flat2_q3, q3); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + q4 = _mm_andnot_si128(flat2, q4); + flat2_q4 = _mm_and_si128(flat2, flat2_q4); + q4 = _mm_or_si128(flat2_q4, q4); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + q5 = _mm_andnot_si128(flat2, q5); + flat2_q5 = _mm_and_si128(flat2, flat2_q5); + q5 = _mm_or_si128(flat2_q5, q5); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + q6 = _mm_andnot_si128(flat2, q6); + flat2_q6 = _mm_and_si128(flat2, flat2_q6); + q6 = _mm_or_si128(flat2_q6, q6); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } + _mm256_zeroupper(); +} diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c new file mode 100644 index 000000000..7e134dc63 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -0,0 +1,1892 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE2 + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" +#include "aom_ports/emmintrin_compat.h" + +static INLINE __m128i abs_diff(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); +} + +#if CONFIG_PARALLEL_DEBLOCKING +// filter_mask and hev_mask +#define FILTER_HEV_MASK4 \ + do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = \ + _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask2(*limit, *blimit, */ \ + /* p1, p0, q0, q1); */ \ + abs_p0q0 = \ + _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ + abs_p1q1 = \ + _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ + } while (0) +#endif // CONFIG_PARALLEL_DEBLOCKING + +// filter_mask and hev_mask +#define FILTER_HEV_MASK \ + do { \ + /* (abs(q1 - q0), abs(p1 - p0) */ \ + __m128i flat = abs_diff(q1p1, q0p0); \ + /* abs(p1 - q1), abs(p0 - q0) */ \ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ + __m128i abs_p0q0, abs_p1q1, work; \ + \ + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ + hev = \ + _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ + hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_packs_epi16(hev, hev); \ + \ + /* const int8_t mask = filter_mask(*limit, *blimit, */ \ + /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ + abs_p0q0 = \ + _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ + abs_p1q1 = \ + _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ + /* abs(p3 - p2), abs(p2 - p1) */ \ + work = abs_diff(p3p2, p2p1); \ + flat = _mm_max_epu8(work, flat); \ + /* abs(q3 - q2), abs(q2 - q1) */ \ + work = abs_diff(q3q2, q2q1); \ + flat = _mm_max_epu8(work, flat); \ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ + mask = _mm_unpacklo_epi64(mask, flat); \ + mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_cmpeq_epi8(mask, zero); \ + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ + } while (0) + +#define FILTER4 \ + do { \ + const __m128i t3t4 = \ + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ + const __m128i t80 = _mm_set1_epi8(0x80); \ + __m128i filter, filter2filter1, work; \ + \ + ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ + qs1qs0 = _mm_xor_si128(q1q0, t80); \ + \ + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ + work = _mm_subs_epi8(ps1ps0, qs1qs0); \ + filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); \ + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ + filter = _mm_and_si128(filter, mask); /* & mask */ \ + filter = _mm_unpacklo_epi64(filter, filter); \ + \ + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ + \ + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ + filter = _mm_unpacklo_epi8(filter, filter); \ + filter = _mm_srai_epi16(filter, 9); /* round */ \ + filter = _mm_packs_epi16(filter, filter); \ + filter = _mm_andnot_si128(hev, filter); \ + \ + hev = _mm_unpackhi_epi64(filter2filter1, filter); \ + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ + \ + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ + qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ + ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ + qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ + ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ + } while (0) + +void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i p3p2, p2p1, q3q2, q2q1; +#endif // !CONFIG_PARALLEL_DEBLOCKING + __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; +#if !CONFIG_PARALLEL_DEBLOCKING + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s - 4 * p))); +#endif // !CONFIG_PARALLEL_DEBLOCKING + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s + 0 * p))); +#if !CONFIG_PARALLEL_DEBLOCKING + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); +#endif // !CONFIG_PARALLEL_DEBLOCKING + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); +#if !CONFIG_PARALLEL_DEBLOCKING + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); +#endif // !CONFIG_PARALLEL_DEBLOCKING +#if !CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK; +#else // CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK4; +#endif // !CONFIG_PARALLEL_DEBLOCKING + FILTER4; + + _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 +} + +void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i limit = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + const __m128i thresh = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i x0, x1, x2, x3; +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i p3p2, p2p1, q3q2, q2q1; +#endif // !CONFIG_PARALLEL_DEBLOCKING + __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; + __m128i mask, hev; + + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + + // Transpose 8x8 + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + p1p0 = _mm_unpacklo_epi16(q1q0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x0 = _mm_unpacklo_epi16(x2, x3); +#if !CONFIG_PARALLEL_DEBLOCKING + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + p3p2 = _mm_unpacklo_epi32(p1p0, x0); +#endif // !CONFIG_PARALLEL_DEBLOCKING + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + p1p0 = _mm_unpackhi_epi32(p1p0, x0); +#if !CONFIG_PARALLEL_DEBLOCKING + p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high +#endif // !CONFIG_PARALLEL_DEBLOCKING + p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + q1q0 = _mm_unpackhi_epi16(q1q0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x2 = _mm_unpackhi_epi16(x2, x3); +#if !CONFIG_PARALLEL_DEBLOCKING + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + q3q2 = _mm_unpackhi_epi32(q1q0, x2); +#endif // !CONFIG_PARALLEL_DEBLOCKING + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + q1q0 = _mm_unpacklo_epi32(q1q0, x2); + + q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); + q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); +#if !CONFIG_PARALLEL_DEBLOCKING + p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); + q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); +#endif // !CONFIG_PARALLEL_DEBLOCKING +#if !CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK; +#else // CONFIG_PARALLEL_DEBLOCKING + FILTER_HEV_MASK4; +#endif // !CONFIG_PARALLEL_DEBLOCKING + FILTER4; + + // Transpose 8x4 to 4x8 + // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 + // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); + // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 + x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); + // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); + + *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 4); + *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + + *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 4); + *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); +} + +void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i abs_p1p0; + + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3p3 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2p2 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1p1 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi16(0x1); + __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); + __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); + __m128i qs0 = _mm_xor_si128(p0q0, t80); + __m128i qs1 = _mm_xor_si128(p1q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + + filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, qs0ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 0xB); + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 0xB); + + // Filter1 >> 3 + filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); + qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); + + // filt >> 1 + filt = _mm_adds_epi16(filter1, t1); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), + filt); + filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); + qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); + // loopfilter done + + { + __m128i work; + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); + + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; + + p7_16 = _mm_unpacklo_epi8(q7p7, zero); + p6_16 = _mm_unpacklo_epi8(q6p6, zero); + p5_16 = _mm_unpacklo_epi8(q5p5, zero); + p4_16 = _mm_unpacklo_epi8(q4p4, zero); + p3_16 = _mm_unpacklo_epi8(q3p3, zero); + p2_16 = _mm_unpacklo_epi8(q2p2, zero); + p1_16 = _mm_unpacklo_epi8(q1p1, zero); + p0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_unpackhi_epi8(q0p0, zero); + q1_16 = _mm_unpackhi_epi8(q1p1, zero); + q2_16 = _mm_unpackhi_epi8(q2p2, zero); + q3_16 = _mm_unpackhi_epi8(q3p3, zero); + q4_16 = _mm_unpackhi_epi8(q4p4, zero); + q5_16 = _mm_unpackhi_epi8(q5p5, zero); + q6_16 = _mm_unpackhi_epi8(q6p6, zero); + q7_16 = _mm_unpackhi_epi8(q7p7, zero); + + pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), + _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), + _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(p7_16, p7_16); + sum_q7 = _mm_add_epi16(q7_16, q7_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + sum_p7 = _mm_add_epi16(sum_p7, p7_16); + sum_q7 = _mm_add_epi16(sum_q7, q7_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); + flat2_q6p6 = _mm_packus_epi16(res_p, res_q); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + q2p2 = _mm_andnot_si128(flat, q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + q2p2 = _mm_or_si128(q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + q6p6 = _mm_andnot_si128(flat2, q6p6); + flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); + q6p6 = _mm_or_si128(q6p6, flat2_q6p6); + _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + + q5p5 = _mm_andnot_si128(flat2, q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + q5p5 = _mm_or_si128(q5p5, flat2_q5p5); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + + q4p4 = _mm_andnot_si128(flat2, q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + q4p4 = _mm_or_si128(q4p4, flat2_q4p4); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + + q3p3 = _mm_andnot_si128(flat2, q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + q3p3 = _mm_or_si128(q3p3, flat2_q3p3); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + + q2p2 = _mm_andnot_si128(flat2, q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + q2p2 = _mm_or_si128(q2p2, flat2_q2p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + + q1p1 = _mm_andnot_si128(flat2, q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + q1p1 = _mm_or_si128(q1p1, flat2_q1p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + + q0p0 = _mm_andnot_si128(flat2, q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + q0p0 = _mm_or_si128(q0p0, flat2_q0p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + } +} + +static INLINE __m128i filter_add2_sub2(const __m128i *const total, + const __m128i *const a1, + const __m128i *const a2, + const __m128i *const s1, + const __m128i *const s2) { + __m128i x = _mm_add_epi16(*a1, *total); + x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); + return x; +} + +static INLINE __m128i filter8_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f8_lo, + const __m128i *const f8_hi) { + const __m128i f8 = + _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); + const __m128i result = _mm_and_si128(*flat, f8); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +static INLINE __m128i filter16_mask(const __m128i *const flat, + const __m128i *const other_filt, + const __m128i *const f_lo, + const __m128i *const f_hi) { + const __m128i f = + _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); + const __m128i result = _mm_and_si128(*flat, f); + return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +} + +void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + + __m128i op2, op1, op0, oq0, oq1, oq2; + + __m128i max_abs_p1p0q1q0; + + p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + + { + const __m128i abs_p1p0 = abs_diff(p1, p0); + const __m128i abs_q1q0 = abs_diff(q1, q0); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i abs_p0q0 = abs_diff(p0, q0); + __m128i abs_p1q1 = abs_diff(p1, q1); + __m128i work; + max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + { + __m128i work; + work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); + flat = _mm_max_epu8(work, max_abs_p1p0q1q0); + work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); + flat2 = _mm_max_epu8(work, flat2); + work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + const __m128i ff = _mm_cmpeq_epi8(t4, t4); + + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + op1 = _mm_xor_si128(p1, t80); + op0 = _mm_xor_si128(p0, t80); + oq0 = _mm_xor_si128(q0, t80); + oq1 = _mm_xor_si128(q1, t80); + + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); + + work_a = _mm_subs_epi8(oq0, op0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); + oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); + // loopfilter done + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // filter8 + { + const __m128i four = _mm_set1_epi16(4); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + __m128i f8_lo, f8_hi; + + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), + _mm_add_epi16(p3_lo, p2_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); + + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), + _mm_add_epi16(p3_hi, p2_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); + + op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); + op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); + op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); + oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); + oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); + + f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); + f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); + oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); + const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); + const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); + const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); + const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); + const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); + const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); + const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); + const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); + const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); + const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); + const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); + const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); + const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); + const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); + const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); + + const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); + const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); + const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); + const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); + const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); + const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); + const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); + const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); + const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); + const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); + const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); + const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); + const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); + const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); + const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); + const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); + + __m128i f_lo; + __m128i f_hi; + + f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 + f_lo = + _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), + _mm_add_epi16(p2_lo, p1_lo)); + f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); + f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); + + f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 + f_hi = + _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), + _mm_add_epi16(p2_hi, p1_hi)); + f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); + f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); + + p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + + f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); + p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + + f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); + p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + + f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); + p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + + f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); + op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + + f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); + op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + + f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); + op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); + oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); + oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); + oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); + q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); + q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); + q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + + f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); + f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); + q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + } +} + +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; + + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + // filter_mask and hev_mask + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(q0p0, p0q0); + abs_p1q1 = abs_diff(q1p1, p1q1); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i ps1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); + + // Filter2 >> 3 + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt = _mm_packs_epi16(filt, zero); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } +} + +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); + __m128i work; + + // filter_mask and hev_mask + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + + do { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < 2); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } +} + +void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + const __m128i zero = _mm_set1_epi16(0); +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i p3, p2, q2, q3; +#endif // !CONFIG_PARALLEL_DEBLOCKING + __m128i p1, p0, q0, q1; + __m128i mask, hev, flat; +#if !CONFIG_PARALLEL_DEBLOCKING + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); +#endif // !CONFIG_PARALLEL_DEBLOCKING + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); +#if !CONFIG_PARALLEL_DEBLOCKING + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); +#endif // !CONFIG_PARALLEL_DEBLOCKING + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); +#if !CONFIG_PARALLEL_DEBLOCKING + __m128i work; +#endif // !CONFIG_PARALLEL_DEBLOCKING + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); +#if !CONFIG_PARALLEL_DEBLOCKING + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8( + _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); +#endif // !CONFIG_PARALLEL_DEBLOCKING + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + const __m128i ps0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + const __m128i qs0 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + const __m128i qs1 = + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + // Filter1 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + // Filter2 >> 3 + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + // filt >> 1 + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + } +} + +static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, + int in_p, unsigned char *out, int out_p) { + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i x8, x9, x10, x11, x12, x13, x14, x15; + + // 2-way interleave w/hoisting of unpacks + x0 = _mm_loadl_epi64((__m128i *)in0); // 1 + x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 + x0 = _mm_unpacklo_epi8(x0, x1); // 1 + + x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 + x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7 + x1 = _mm_unpacklo_epi8(x2, x3); // 2 + + x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9 + x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11 + x2 = _mm_unpacklo_epi8(x4, x5); // 3 + + x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13 + x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15 + x3 = _mm_unpacklo_epi8(x6, x7); // 4 + x4 = _mm_unpacklo_epi16(x0, x1); // 9 + + x8 = _mm_loadl_epi64((__m128i *)in1); // 2 + x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 + x8 = _mm_unpacklo_epi8(x8, x9); // 5 + x5 = _mm_unpacklo_epi16(x2, x3); // 10 + + x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 + x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8 + x9 = _mm_unpacklo_epi8(x10, x11); // 6 + + x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10 + x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12 + x10 = _mm_unpacklo_epi8(x12, x13); // 7 + x12 = _mm_unpacklo_epi16(x8, x9); // 11 + + x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14 + x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16 + x11 = _mm_unpacklo_epi8(x14, x15); // 8 + x13 = _mm_unpacklo_epi16(x10, x11); // 12 + + x6 = _mm_unpacklo_epi32(x4, x5); // 13 + x7 = _mm_unpackhi_epi32(x4, x5); // 14 + x14 = _mm_unpacklo_epi32(x12, x13); // 15 + x15 = _mm_unpackhi_epi32(x12, x13); // 16 + + // Store first 4-line result + _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + + x4 = _mm_unpackhi_epi16(x0, x1); + x5 = _mm_unpackhi_epi16(x2, x3); + x12 = _mm_unpackhi_epi16(x8, x9); + x13 = _mm_unpackhi_epi16(x10, x11); + + x6 = _mm_unpacklo_epi32(x4, x5); + x7 = _mm_unpackhi_epi32(x4, x5); + x14 = _mm_unpacklo_epi32(x12, x13); + x15 = _mm_unpackhi_epi32(x12, x13); + + // Store second 4-line result + _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); + _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); + _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); +} + +#if CONFIG_PARALLEL_DEBLOCKING +#define movq(p) _mm_loadl_epi64((const __m128i *)(p)) +#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1) +#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1) +#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1) +#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r) +#define pshufd(r, imm) _mm_shuffle_epi32(r, imm) +enum { ROTATE_DWORD_RIGHT = 0x39 }; +static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride, + const uint8_t *pSrc, + const ptrdiff_t srcStride) { + for (uint32_t idx = 0; idx < 2; idx += 1) { + __m128i r0, r1, r2, r3; + // load data + r0 = movq(pSrc); + r1 = movq(pSrc + srcStride); + r2 = movq(pSrc + srcStride * 2); + r3 = movq(pSrc + srcStride * 3); + // transpose + r0 = punpcklbw(r0, r1); + r2 = punpcklbw(r2, r3); + r1 = punpckhwd(r0, r2); + r0 = punpcklwd(r0, r2); + // store data + movd(pDst, r0); + r0 = pshufd(r0, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride, r0); + r0 = pshufd(r0, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 2, r0); + r0 = pshufd(r0, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 3, r0); + movd(pDst + dstStride * 4, r1); + r1 = pshufd(r1, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 5, r1); + r1 = pshufd(r1, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 6, r1); + r1 = pshufd(r1, ROTATE_DWORD_RIGHT); + movd(pDst + dstStride * 7, r1); + // advance the pointers + pDst += dstStride * 8; + pSrc += 8; + } +} + +#endif // CONFIG_PARALLEL_DEBLOCKING +static INLINE void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = + _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + x1 = + _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + + x2 = + _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + x3 = + _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + + x4 = + _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + x5 = + _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + + x6 = + _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + x7 = + _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + _mm_storel_pd((double *)(out + 0 * out_p), + _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 + _mm_storeh_pd((double *)(out + 1 * out_p), + _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + _mm_storel_pd((double *)(out + 2 * out_p), + _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 + _mm_storeh_pd((double *)(out + 3 * out_p), + _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + _mm_storel_pd((double *)(out + 4 * out_p), + _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 + _mm_storeh_pd((double *)(out + 5 * out_p), + _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 6 * out_p), + _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 + _mm_storeh_pd((double *)(out + 7 * out_p), + _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); +#if !CONFIG_PARALLEL_DEBLOCKING + unsigned char *src[2]; + unsigned char *dst[2]; +#endif // !CONFIG_PARALLEL_DEBLOCKING + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); +#if !CONFIG_PARALLEL_DEBLOCKING + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +#else // CONFIG_PARALLEL_DEBLOCKING + transpose16x4(s - 2, p, t_dst + 16 * 2, 16); +#endif // !CONFIG_PARALLEL_DEBLOCKING +} + +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); + unsigned char *src[1]; + unsigned char *dst[1]; + + // Transpose 8x8 + src[0] = s - 4; + dst[0] = t_dst; + + transpose(src, p, dst, 8, 1); + + // Loop filtering + aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + dst[0] = s - 4; + + // Transpose back + transpose(src, 8, dst, p, 1); +} + +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); + unsigned char *src[2]; + unsigned char *dst[2]; + + // Transpose 8x16 + transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, + blimit1, limit1, thresh1); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + transpose(src, 16, dst, p, 2); +} + +void aom_lpf_vertical_16_sse2(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { + DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); + unsigned char *src[2]; + unsigned char *dst[2]; + + src[0] = s - 8; + src[1] = s; + dst[0] = t_dst; + dst[1] = t_dst + 8 * 8; + + // Transpose 16x8 + transpose(src, p, dst, 8, 2); + + // Loop filtering + aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + + src[0] = t_dst; + src[1] = t_dst + 8 * 8; + dst[0] = s - 8; + dst[1] = s; + + // Transpose back + transpose(src, 8, dst, p, 2); +} + +void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { + DECLARE_ALIGNED(16, unsigned char, t_dst[256]); + + // Transpose 16x16 + transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); + transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + + // Loop filtering + aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); + + // Transpose back + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); +} diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c new file mode 100644 index 000000000..5166e9e0a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_ports/mem.h" +#include "./aom_config.h" +#include "aom/aom_integer.h" + +static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); + return _mm_unpacklo_epi64(temp1, temp2); +} + +static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { + __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr); + __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride)); + __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); + temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2)); + temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3)); + temp1 = _mm_unpacklo_epi32(temp1, temp2); + return _mm_unpacklo_epi64(temp3, temp1); +} + +static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height); + +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \ + m, n); \ + } + +#if CONFIG_EXT_PARTITION +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +#endif // CONFIG_EXT_PARTITION +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) + +#define MASKSAD8XN_SSSE3(n) \ + unsigned int aom_masked_sad8x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } + +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) + +#define MASKSAD4XN_SSSE3(n) \ + unsigned int aom_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } + +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) + +// For width a multiple of 16 +// Assumes values in m are <=64 +static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height) { + int y, x; + __m128i a, b, m, temp1, temp2; + __m128i res = _mm_setzero_si128(); + __m128i one = _mm_set1_epi16(1); + // For each row + for (y = 0; y < height; y++) { + // Covering the full width + for (x = 0; x < width; x += 16) { + // Load a, b, m in xmm registers + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_loadu_si128((const __m128i *)(m_ptr + x)); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu8(a, b); + temp2 = _mm_subs_epu8(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + temp2 = _mm_maddubs_epi16(temp1, m); + // Pad out row result to 32 bit integers & add to running total + res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one)); + } + // Move onto the next row + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +static INLINE unsigned int masked_sad8xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { + int y; + __m128i a, b, m, temp1, temp2, row_res; + __m128i res = _mm_setzero_si128(); + __m128i one = _mm_set1_epi16(1); + // Add the masked SAD for 2 rows at a time + for (y = 0; y < height; y += 2) { + // Load a, b, m in xmm registers + a = width8_load_2rows(a_ptr, a_stride); + b = width8_load_2rows(b_ptr, b_stride); + m = width8_load_2rows(m_ptr, m_stride); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu8(a, b); + temp2 = _mm_subs_epu8(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + row_res = _mm_maddubs_epi16(temp1, m); + + // Pad out row result to 32 bit integers & add to running total + res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); + + // Move onto the next rows + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +static INLINE unsigned int masked_sad4xh_ssse3( + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { + int y; + __m128i a, b, m, temp1, temp2, row_res; + __m128i res = _mm_setzero_si128(); + __m128i one = _mm_set1_epi16(1); + // Add the masked SAD for 4 rows at a time + for (y = 0; y < height; y += 4) { + // Load a, b, m in xmm registers + a = width4_load_4rows(a_ptr, a_stride); + b = width4_load_4rows(b_ptr, b_stride); + m = width4_load_4rows(m_ptr, m_stride); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu8(a, b); + temp2 = _mm_subs_epu8(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + row_res = _mm_maddubs_epi16(temp1, m); + + // Pad out row result to 32 bit integers & add to running total + res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); + + // Move onto the next rows + a_ptr += a_stride * 4; + b_ptr += b_stride * 4; + m_ptr += m_stride * 4; + } + // Pad out row result to 32 bit integers & add to running total + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +#if CONFIG_HIGHBITDEPTH +static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, + int stride) { + __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); + __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); + return _mm_unpacklo_epi64(temp1, temp2); +} + +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height); + +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height); + +#define HIGHBD_MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } + +#if CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN_SSSE3(128, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 128) +#endif // CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN_SSSE3(64, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 64) +HIGHBD_MASKSADMXN_SSSE3(32, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 32) +HIGHBD_MASKSADMXN_SSSE3(16, 16) +HIGHBD_MASKSADMXN_SSSE3(16, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 16) +HIGHBD_MASKSADMXN_SSSE3(8, 8) +HIGHBD_MASKSADMXN_SSSE3(8, 4) + +#define HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *msk, int msk_stride) { \ + return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ + msk_stride, n); \ + } + +HIGHBD_MASKSAD4XN_SSSE3(8) +HIGHBD_MASKSAD4XN_SSSE3(4) + +// For width a multiple of 8 +// Assumes values in m are <=64 +static INLINE unsigned int highbd_masked_sad_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, int height) { + int y, x; + __m128i a, b, m, temp1, temp2; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); + __m128i res = _mm_setzero_si128(); + // For each row + for (y = 0; y < height; y++) { + // Covering the full width + for (x = 0; x < width; x += 8) { + // Load a, b, m in xmm registers + a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); + b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); + m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)), + _mm_setzero_si128()); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu16(a, b); + temp2 = _mm_subs_epu16(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Add result of multiplying by m and add pairs together to running total + res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); + } + // Move onto the next row + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} + +static INLINE unsigned int highbd_masked_sad4xh_ssse3( + const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height) { + int y; + __m128i a, b, m, temp1, temp2; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); + __m128i res = _mm_setzero_si128(); + // Add the masked SAD for 2 rows at a time + for (y = 0; y < height; y += 2) { + // Load a, b, m in xmm registers + a = highbd_width4_load_2rows(a_ptr, a_stride); + b = highbd_width4_load_2rows(b_ptr, b_stride); + temp1 = _mm_loadl_epi64((const __m128i *)m_ptr); + temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)); + m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), + _mm_setzero_si128()); + + // Calculate the difference between a & b + temp1 = _mm_subs_epu16(a, b); + temp2 = _mm_subs_epu16(b, a); + temp1 = _mm_or_si128(temp1, temp2); + + // Multiply by m and add together + res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); + + // Move onto the next rows + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; + } + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + res = _mm_hadd_epi32(res, _mm_setzero_si128()); + // sad = (sad + 31) >> 6; + return (_mm_cvtsi128_si32(res) + 31) >> 6; +} +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c new file mode 100644 index 000000000..fe14597f6 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -0,0 +1,1948 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_filter.h" + +// Half pixel shift +#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2) + +/***************************************************************************** + * Horizontal additions + *****************************************************************************/ + +static INLINE int32_t hsum_epi32_si32(__m128i v_d) { + v_d = _mm_hadd_epi32(v_d, v_d); + v_d = _mm_hadd_epi32(v_d, v_d); + return _mm_cvtsi128_si32(v_d); +} + +static INLINE int64_t hsum_epi64_si64(__m128i v_q) { + v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); +#if ARCH_X86_64 + return _mm_cvtsi128_si64(v_q); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_q); + return tmp; + } +#endif +} + +#if CONFIG_HIGHBITDEPTH +static INLINE int64_t hsum_epi32_si64(__m128i v_d) { + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); + const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); + return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); +} +#endif // CONFIG_HIGHBITDEPTH + +static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, + uint32_t *sse, int w, int h) { + int64_t sum64; + uint64_t sse64; + + // Horizontal sum + sum64 = hsum_epi32_si32(v_sum_d); + sse64 = hsum_epi64_si64(v_sse_q); + + sum64 = (sum64 >= 0) ? sum64 : -sum64; + + // Round + sum64 = ROUND_POWER_OF_TWO(sum64, 6); + sse64 = ROUND_POWER_OF_TWO(sse64, 12); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute the variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +/***************************************************************************** + * n*16 Wide versions + *****************************************************************************/ + +static INLINE unsigned int masked_variancewxh_ssse3( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + int ii, jj; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((w % 16) == 0); + + for (ii = 0; ii < h; ii++) { + for (jj = 0; jj < w; jj += 16) { + // Load inputs - 8 bits + const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj)); + + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero); + const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero); + const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero); + + // Difference: [-255, 255] + const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w); + const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w); + const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w); + const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + + // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits + const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w); + const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w); + + // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits + const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); + } + + // Move on to next row + a += a_stride; + b += b_stride; + m += m_stride; + } + + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} + +#define MASKED_VARWXH(W, H) \ + unsigned int aom_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \ + H, sse); \ + } + +MASKED_VARWXH(16, 8) +MASKED_VARWXH(16, 16) +MASKED_VARWXH(16, 32) +MASKED_VARWXH(32, 16) +MASKED_VARWXH(32, 32) +MASKED_VARWXH(32, 64) +MASKED_VARWXH(64, 32) +MASKED_VARWXH(64, 64) +#if CONFIG_EXT_PARTITION +MASKED_VARWXH(64, 128) +MASKED_VARWXH(128, 64) +MASKED_VARWXH(128, 128) +#endif // CONFIG_EXT_PARTITION + +/***************************************************************************** + * 8 Wide versions + *****************************************************************************/ + +static INLINE unsigned int masked_variance8xh_ssse3( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { + int ii; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + for (ii = 0; ii < h; ii++) { + // Load inputs - 8 bits + const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m); + + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-255, 255] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w); + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits + const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); + + // Move on to next row + a += a_stride; + b += b_stride; + m += m_stride; + } + + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +#define MASKED_VAR8XH(H) \ + unsigned int aom_masked_variance8x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } + +MASKED_VAR8XH(4) +MASKED_VAR8XH(8) +MASKED_VAR8XH(16) + +/***************************************************************************** + * 4 Wide versions + *****************************************************************************/ + +static INLINE unsigned int masked_variance4xh_ssse3( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, unsigned int *sse) { + int ii; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((h % 2) == 0); + + for (ii = 0; ii < h / 2; ii++) { + // Load 2 input rows - 8 bits + const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a); + const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride)); + const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); + + // Interleave 2 rows into a single register + const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b); + const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b); + const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b); + + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-255, 255] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w); + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits + const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); + + // Move on to next 2 row + a += a_stride * 2; + b += b_stride * 2; + m += m_stride * 2; + } + + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + +#define MASKED_VAR4XH(H) \ + unsigned int aom_masked_variance4x##H##_ssse3( \ + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ + sse); \ + } + +MASKED_VAR4XH(4) +MASKED_VAR4XH(8) + +#if CONFIG_HIGHBITDEPTH + +// Main calculation for n*8 wide blocks +static INLINE void highbd_masked_variance64_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) { + int ii, jj; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((w % 8) == 0); + + for (ii = 0; ii < h; ii++) { + for (jj = 0; jj < w; jj += 8) { + // Load inputs - 8 bits + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj)); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj)); + const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj)); + + // Unpack m to 16 bits - still containing max 8 bits + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-4095, 4095] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) + const __m128i v_absd_w = _mm_abs_epi16(v_d_w); + const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); + const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); + const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); + const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); + const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); + const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); + // Square and sum the errors -> 36bits * 4 = 38bits + __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; + v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); + v_elo1_d = _mm_srli_si128(v_elo_d, 4); + v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); + v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); + v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); + v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); + v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); + v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); + v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_q); + } + + // Move on to next row + a += a_stride; + b += b_stride; + m += m_stride; + } + + // Horizontal sum + *sum = hsum_epi32_si64(v_sum_d); + *sse = hsum_epi64_si64(v_sse_q); + + // Round + *sum = (*sum >= 0) ? *sum : -*sum; + *sum = ROUND_POWER_OF_TWO(*sum, 6); + *sse = ROUND_POWER_OF_TWO(*sse, 12); +} + +// Main calculation for 4 wide blocks +static INLINE void highbd_masked_variance64_4wide_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) { + int ii; + + const __m128i v_zero = _mm_setzero_si128(); + + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + + assert((h % 2) == 0); + + for (ii = 0; ii < h / 2; ii++) { + // Load 2 input rows - 8 bits + const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); + const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); + + // Interleave 2 rows into a single register + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w); + const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b); + + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-4095, 4095] + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incld sign bit) + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) + const __m128i v_absd_w = _mm_abs_epi16(v_d_w); + const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); + const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); + const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); + const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); + const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); + const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); + // Square and sum the errors -> 36bits * 4 = 38bits + __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; + v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); + v_elo1_d = _mm_srli_si128(v_elo_d, 4); + v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); + v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); + v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); + v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); + v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); + v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); + v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); + + // Accumulate + v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); + v_sse_q = _mm_add_epi64(v_sse_q, v_se_q); + + // Move on to next row + a += a_stride * 2; + b += b_stride * 2; + m += m_stride * 2; + } + + // Horizontal sum + *sum = hsum_epi32_si32(v_sum_d); + *sse = hsum_epi64_si64(v_sse_q); + + // Round + *sum = (*sum >= 0) ? *sum : -*sum; + *sum = ROUND_POWER_OF_TWO(*sum, 6); + *sse = ROUND_POWER_OF_TWO(*sse, 12); +} + +static INLINE unsigned int highbd_masked_variancewxh_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + uint64_t sse64; + int64_t sum64; + + if (w == 4) + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); + else + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute and return variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +static INLINE unsigned int highbd_10_masked_variancewxh_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + uint64_t sse64; + int64_t sum64; + + if (w == 4) + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); + else + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); + + // Normalise + sum64 = ROUND_POWER_OF_TWO(sum64, 2); + sse64 = ROUND_POWER_OF_TWO(sse64, 4); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute and return variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +static INLINE unsigned int highbd_12_masked_variancewxh_ssse3( + const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, + const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { + uint64_t sse64; + int64_t sum64; + + if (w == 4) + highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, + h, &sum64, &sse64); + else + highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, + &sum64, &sse64); + + sum64 = ROUND_POWER_OF_TWO(sum64, 4); + sse64 = ROUND_POWER_OF_TWO(sse64, 8); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute and return variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +#define HIGHBD_MASKED_VARWXH(W, H) \ + unsigned int aom_highbd_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } \ + \ + unsigned int aom_highbd_12_masked_variance##W##x##H##_ssse3( \ + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ + const uint8_t *m, int m_stride, unsigned int *sse) { \ + uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ + uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ + return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ + m_stride, W, H, sse); \ + } + +HIGHBD_MASKED_VARWXH(4, 4) +HIGHBD_MASKED_VARWXH(4, 8) +HIGHBD_MASKED_VARWXH(8, 4) +HIGHBD_MASKED_VARWXH(8, 8) +HIGHBD_MASKED_VARWXH(8, 16) +HIGHBD_MASKED_VARWXH(16, 8) +HIGHBD_MASKED_VARWXH(16, 16) +HIGHBD_MASKED_VARWXH(16, 32) +HIGHBD_MASKED_VARWXH(32, 16) +HIGHBD_MASKED_VARWXH(32, 32) +HIGHBD_MASKED_VARWXH(32, 64) +HIGHBD_MASKED_VARWXH(64, 32) +HIGHBD_MASKED_VARWXH(64, 64) +#if CONFIG_EXT_PARTITION +HIGHBD_MASKED_VARWXH(64, 128) +HIGHBD_MASKED_VARWXH(128, 64) +HIGHBD_MASKED_VARWXH(128, 128) +#endif // CONFIG_EXT_PARTITION + +#endif + +////////////////////////////////////////////////////////////////////////////// +// Sub pixel versions +////////////////////////////////////////////////////////////////////////////// + +typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b, + __m128i v_filter_b); + +static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b, + const __m128i v_filter_b) { + (void)v_filter_b; + return _mm_avg_epu8(v_a_b, v_b_b); +} + +static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b, + const __m128i v_filter_b) { + const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); + __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b); + __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b); + __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b); + __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b); + __m128i v_res_lo_w = + _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); + __m128i v_res_hi_w = + _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS); + return _mm_packus_epi16(v_res_lo_w, v_res_hi_w); +} + +// Apply the filter to the contents of the lower half of a and b +static INLINE void apply_filter_lo(const __m128i v_a_lo_b, + const __m128i v_b_lo_b, + const __m128i v_filter_b, __m128i *v_res_w) { + const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); + __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b); + __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b); + *v_res_w = + _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); +} + +static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b, + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { + const __m128i v_zero = _mm_setzero_si128(); + // Unpack to 16 bits - still containing max 8 bits + const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); + const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero); + const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero); + const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero); + const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero); + const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero); + + // Difference: [-255, 255] + const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w); + const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w); + + // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits + const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w); + const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w); + const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w); + const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w); + + // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits + const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w); + const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w); + + // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits + const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d); + + // Unpack Squared error to 64 bits + const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); + const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); + + // Accumulate + *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d); + *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d); + *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q); + *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q); +} + +// Functions for width (W) >= 16 +unsigned int aom_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { + int i, j; + __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 16) { + // Load the first row ready + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row apply the filter + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); + v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row apply the filter + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); + v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride)); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} +unsigned int aom_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int w, int h, + filter_fn_t filter_fn) { + int i, j; + __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j += 16) { + // Load this row and one below & apply the filter to them + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); + + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + } + src += src_stride; + dst += dst_stride; + msk += msk_stride; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} +unsigned int aom_masked_subpel_varWxH_xnonzero_ynonzero( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, filter_fn_t xfilter_fn, + filter_fn_t yfilter_fn) { + int i, j; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b; + __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filterx_b = _mm_set1_epi16( + (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_b = _mm_set1_epi16( + (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 16) { + // Load the first row ready + v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row & apply the filter + v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); + v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); + // Complete the calculation for this row and add it to the running total + v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row & apply the filter + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_b = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); + v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); + // Load the dst and msk for the variance calculation + v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j)); + // Complete the calculation for this row and add it to the running total + v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b); + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); +} + +// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, +// xmm[63:32] = row 3, xmm[31:0] = row 4 +unsigned int aom_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w; + __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; + __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first row of src data ready + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + for (i = 0; i < h; i += 4) { + // Load the rest of the source data for these rows + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); + // Load the dst data + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); + v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); + v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); + v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); + // Load the mask data + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); + v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); + v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); + v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); + // Apply the y filter + if (yoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b); + } else { + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src1_b, 4), + _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0))); + apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w); + v_src2_b = + _mm_or_si128(_mm_slli_si128(v_src3_b, 4), + _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); + apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w); + v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w); + } + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + +// Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 +unsigned int aom_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride, + int yoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b; + __m128i v_dst_b = _mm_setzero_si128(); + __m128i v_msk_b = _mm_setzero_si128(); + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first row of src data ready + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + for (i = 0; i < h; i += 2) { + if (yoffset == HALF_PIXEL_OFFSET) { + // Load the rest of the source data for these rows + v_src1_b = _mm_or_si128( + _mm_slli_si128(v_src0_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); + v_src0_b = _mm_or_si128( + _mm_slli_si128(v_src1_b, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); + // Apply the y filter + v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b); + } else { + // Load the data and apply the y filter + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w); + v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w); + v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, +// xmm[63:32] = row 3, xmm[31:0] = row 4 +unsigned int aom_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; + __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; + __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; + __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 4) { + // Load the src data + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); + v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); + v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); + v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); + // Load the dst data + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); + v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); + v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); + v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); + // Load the mask data + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); + v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); + v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); + v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b); + v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b); + v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w); + apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w); + v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w); + } + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride, + int xoffset, const uint8_t *dst, + int dst_stride, const uint8_t *msk, + int msk_stride, unsigned int *sse, + int h) { + int i; + __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w; + __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 2) { + // Load the src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w); + v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, +// xmm[63:32] = row 3, xmm[31:0] = row 4 +unsigned int aom_masked_subpel_var4xH_xnonzero_ynonzero( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { + int i; + __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; + __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; + __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b; + __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b; + __m128i v_xres_b[2]; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 4) { + // Load the src data + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); + v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); + v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); + v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); + v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); + v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b); + v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b); + v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w); + v_xres_b[i == 0 ? 0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w); + } + // Move onto the next set of rows + src += src_stride * 4; + } + // Load one more row to be used in the y filter + v_src0_b = _mm_loadl_epi64((const __m128i *)src); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b), + _mm_setr_epi32(-1, 0, 0, 0)); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + v_extra_row_b = + _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()), + _mm_setr_epi32(-1, 0, 0, 0)); + } + + for (i = 0; i < h; i += 4) { + if (h == 8 && i == 0) { + v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4), + _mm_srli_si128(v_xres_b[1], 12)); + } else { + v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4), + v_extra_row_b); + } + // Apply the y filter + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b); + } else { + v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b); + } + + // Load the dst data + v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); + v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); + v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); + v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); + v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); + v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); + v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); + // Load the mask data + v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); + v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); + v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); + v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); + v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); + v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); + v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_masked_subpel_var8xH_xnonzero_ynonzero( + const uint8_t *src, int src_stride, int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h) { + int i; + __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b; + __m128i v_src0_shift_b, v_src1_shift_b; + __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first block of src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); + v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + for (i = 0; i < h; i += 4) { + // Load the next block of src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); + v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + // Apply the y filter to the previous block + v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8), + _mm_slli_si128(v_xres1_b, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b); + } else { + v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next block of src data + v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); + v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); + v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); + v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); + v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); + v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); + } else { + apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); + apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); + v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } + // Apply the y filter to the previous block + v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8), + _mm_slli_si128(v_xres0_b, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b); + } else { + v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b); + } + // Load the dst data + v_dst_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + // Compute the sum and SSE + sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); +} + +// For W >=16 +#define MASK_SUBPIX_VAR_LARGE(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W % 16 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return aom_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \ + dst_stride, msk, msk_stride, \ + sse, W, H, apply_filter); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \ + apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter_avg, apply_filter); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter, apply_filter_avg); \ + else \ + return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, apply_filter, apply_filter); \ + } \ + } + +// For W < 16 +#define MASK_SUBPIX_VAR_SMALL(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + assert(W == 4 || W == 8); \ + if (xoffset == 0 && yoffset == 0) \ + return aom_masked_variance##W##x##H##_ssse3( \ + src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ + else if (xoffset == 0) \ + return aom_masked_subpel_var##W##xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else if (yoffset == 0) \ + return aom_masked_subpel_var##W##xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \ + else \ + return aom_masked_subpel_var##W##xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H); \ + } + +MASK_SUBPIX_VAR_SMALL(4, 4) +MASK_SUBPIX_VAR_SMALL(4, 8) +MASK_SUBPIX_VAR_SMALL(8, 4) +MASK_SUBPIX_VAR_SMALL(8, 8) +MASK_SUBPIX_VAR_SMALL(8, 16) +MASK_SUBPIX_VAR_LARGE(16, 8) +MASK_SUBPIX_VAR_LARGE(16, 16) +MASK_SUBPIX_VAR_LARGE(16, 32) +MASK_SUBPIX_VAR_LARGE(32, 16) +MASK_SUBPIX_VAR_LARGE(32, 32) +MASK_SUBPIX_VAR_LARGE(32, 64) +MASK_SUBPIX_VAR_LARGE(64, 32) +MASK_SUBPIX_VAR_LARGE(64, 64) +#if CONFIG_EXT_PARTITION +MASK_SUBPIX_VAR_LARGE(64, 128) +MASK_SUBPIX_VAR_LARGE(128, 64) +MASK_SUBPIX_VAR_LARGE(128, 128) +#endif // CONFIG_EXT_PARTITION + +#if CONFIG_HIGHBITDEPTH +typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q, + uint32_t *sse, int w, int h); +typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, + unsigned int *sse); +typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w, + __m128i v_filter_w); + +static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w, + const __m128i v_b_w, + const __m128i v_filter_w) { + (void)v_filter_w; + return _mm_avg_epu16(v_a_w, v_b_w); +} + +static INLINE __m128i highbd_apply_filter(const __m128i v_a_w, + const __m128i v_b_w, + const __m128i v_filter_w) { + const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); + __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w); + __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w); + __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w); + __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w); + __m128i v_res_lo_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); + __m128i v_res_hi_d = + _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS); + return _mm_packs_epi32(v_res_lo_d, v_res_hi_d); +} +// Apply the filter to the contents of the lower half of a and b +static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w, + const __m128i v_b_lo_w, + const __m128i v_filter_w, + __m128i *v_res_d) { + const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); + __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w); + __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w); + *v_res_d = + _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); +} + +static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w, + const __m128i v_m_b, __m128i *v_sum_d, + __m128i *v_sse_q) { + const __m128i v_zero = _mm_setzero_si128(); + const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); + + // Difference: [-2^12, 2^12] => 13 bits (incld sign bit) + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + + // Error - [-4095, 4095] * [0, 64] & sum pairs => fits in 19 + 1 bits + const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); + + // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) + const __m128i v_absd_w = _mm_abs_epi16(v_d_w); + const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); + const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); + const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); + const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); + const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); + const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); + // Square and sum the errors -> 36bits * 4 = 38bits + __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; + v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); + v_elo1_d = _mm_srli_si128(v_elo_d, 4); + v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); + v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); + v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); + v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); + v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); + v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); + v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); + + // Accumulate + *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d); + *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q); +} + +static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d, + __m128i v_sse_q, + uint32_t *sse, int w, + int h) { + int64_t sum64; + uint64_t sse64; + + // Horizontal sum + sum64 = hsum_epi32_si32(v_sum_d); + sse64 = hsum_epi64_si64(v_sse_q); + + sum64 = (sum64 >= 0) ? sum64 : -sum64; + + // Round + sum64 = ROUND_POWER_OF_TWO(sum64, 6); + sse64 = ROUND_POWER_OF_TWO(sse64, 12); + + // Normalise + sum64 = ROUND_POWER_OF_TWO(sum64, 2); + sse64 = ROUND_POWER_OF_TWO(sse64, 4); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute the variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} +static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d, + __m128i v_sse_q, + uint32_t *sse, int w, + int h) { + int64_t sum64; + uint64_t sse64; + + // Horizontal sum + sum64 = hsum_epi32_si64(v_sum_d); + sse64 = hsum_epi64_si64(v_sse_q); + + sum64 = (sum64 >= 0) ? sum64 : -sum64; + + // Round + sum64 = ROUND_POWER_OF_TWO(sum64, 6); + sse64 = ROUND_POWER_OF_TWO(sse64, 12); + + // Normalise + sum64 = ROUND_POWER_OF_TWO(sum64, 4); + sse64 = ROUND_POWER_OF_TWO(sse64, 8); + + // Store the SSE + *sse = (uint32_t)sse64; + // Compute the variance + return *sse - (uint32_t)((sum64 * sum64) / (w * h)); +} + +// High bit depth functions for width (W) >= 8 +unsigned int aom_highbd_masked_subpel_varWxH_xzero( + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { + int i, j; + __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 8) { + // Load the first row ready + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row apply the filter + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); + v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row apply the filter + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); + v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride)); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_var(v_sum_d, v_sse_q, sse, w, h); +} +unsigned int aom_highbd_masked_subpel_varWxH_yzero( + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int w, int h, highbd_filter_fn_t filter_fn, + highbd_calc_masked_var_t calc_var) { + int i, j; + __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filter_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j += 8) { + // Load this row & apply the filter to them + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); + + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + } + src += src_stride; + dst += dst_stride; + msk += msk_stride; + } + return calc_var(v_sum_d, v_sse_q, sse, w, h); +} + +unsigned int aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn, + highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) { + int i, j; + __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w; + __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + const __m128i v_filterx_w = + _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + const __m128i v_filtery_w = + _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + for (j = 0; j < w; j += 8) { + // Load the first row ready + v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); + v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); + // Process 2 rows at a time + for (i = 0; i < h; i += 2) { + // Load the next row & apply the filter + v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); + v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); + v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); + // Complete the calculation for this row and add it to the running total + v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next row & apply the filter + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); + v_src1_w = + _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); + v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); + // Load the dst and msk for the variance calculation + v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); + v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j)); + // Complete the calculation for this row and add it to the running total + v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w); + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next block of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + // Reset to the top of the block + src -= src_stride * h; + dst -= dst_stride * h; + msk -= msk_stride * h; + } + return calc_var(v_sum_d, v_sse_q, sse, w, h); +} + +// Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 +unsigned int aom_highbd_masked_subpel_var4xH_xzero( + const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { + int i; + __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w; + __m128i v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first row of src data ready + v_src0_w = _mm_loadl_epi64((const __m128i *)src); + for (i = 0; i < h; i += 2) { + if (yoffset == HALF_PIXEL_OFFSET) { + // Load the rest of the source data for these rows + v_src1_w = _mm_or_si128( + _mm_slli_si128(v_src0_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); + v_src0_w = _mm_or_si128( + _mm_slli_si128(v_src1_w, 8), + _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); + // Apply the y filter + v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w); + } else { + // Load the data and apply the y filter + v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); + highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d); + v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); + highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d); + v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_var(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_highbd_masked_subpel_var4xH_yzero( + const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, + int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, + int h, highbd_calc_masked_var_t calc_var) { + int i; + __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d; + __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + for (i = 0; i < h; i += 2) { + // Load the src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w, + &v_filtered1_d); + v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 2; + dst += dst_stride * 2; + msk += msk_stride * 2; + } + return calc_var(v_sum_d, v_sse_q, sse, 4, h); +} + +unsigned int aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( + const uint16_t *src, int src_stride, int xoffset, int yoffset, + const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, + unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { + int i; + __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b; + __m128i v_src0_shift_w, v_src1_shift_w; + __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_q = _mm_setzero_si128(); + __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + + bilinear_filters_2t[xoffset][0]); + __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + + bilinear_filters_2t[yoffset][0]); + assert(xoffset < BIL_SUBPEL_SHIFTS); + assert(yoffset < BIL_SUBPEL_SHIFTS); + // Load the first block of src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, + &v_filtered1_d); + v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + for (i = 0; i < h; i += 4) { + // Load the next block of src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, + &v_filtered1_d); + v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + // Apply the y filter to the previous block + v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8), + _mm_slli_si128(v_xres1_w, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w); + } else { + v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + + // Load the next block of src data + v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); + v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); + v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); + v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); + // Apply the x filter + if (xoffset == HALF_PIXEL_OFFSET) { + v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); + v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); + v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); + } else { + highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, + &v_filtered0_d); + highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, + &v_filtered1_d); + v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } + // Apply the y filter to the previous block + v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8), + _mm_slli_si128(v_xres0_w, 8)); + if (yoffset == HALF_PIXEL_OFFSET) { + v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w); + } else { + v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w); + } + // Load the dst data + v_dst_w = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), + _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); + // Load the mask data + v_msk_b = _mm_unpacklo_epi32( + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), + _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); + // Compute the sum and SSE + highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + // Move onto the next set of rows + src += src_stride * 4; + dst += dst_stride * 4; + msk += msk_stride * 4; + } + return calc_var(v_sum_d, v_sse_q, sse, 4, h); +} + +// For W >=8 +#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \ + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W % 8 == 0); \ + if (xoffset == 0) { \ + if (yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (yoffset == 0) { \ + if (xoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \ + W, H, highbd_apply_filter, calc_var); \ + } else if (xoffset == HALF_PIXEL_OFFSET) { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ + dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter_avg, \ + highbd_apply_filter, calc_var); \ + } else { \ + if (yoffset == HALF_PIXEL_OFFSET) \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, \ + highbd_apply_filter_avg, calc_var); \ + else \ + return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ + msk_stride, sse, W, H, highbd_apply_filter, highbd_apply_filter, \ + calc_var); \ + } \ + } + +// For W < 8 +#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \ + unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse, highbd_calc_masked_var_t calc_var, \ + highbd_variance_fn_t full_variance_function) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + assert(W == 4); \ + if (xoffset == 0 && yoffset == 0) \ + return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ + msk_stride, sse); \ + else if (xoffset == 0) \ + return aom_highbd_masked_subpel_var4xH_xzero( \ + src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else if (yoffset == 0) \ + return aom_highbd_masked_subpel_var4xH_yzero( \ + src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \ + calc_var); \ + else \ + return aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \ + src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ + sse, H, calc_var); \ + } + +#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \ + unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, calc_masked_variance, \ + aom_highbd_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_10_calc_masked_variance, \ + aom_highbd_10_masked_variance##W##x##H##_ssse3); \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ + unsigned int *sse) { \ + return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ + src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ + sse, highbd_12_calc_masked_variance, \ + aom_highbd_12_masked_variance##W##x##H##_ssse3); \ + } + +HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4) +HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8) +HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4) +HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8) +HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16) +HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8) +HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16) +HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32) +HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16) +HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32) +HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64) +HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32) +HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64) +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128) +HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64) +HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128) +HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128) +#endif // CONFIG_EXT_PARTITION +#endif diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c new file mode 100644 index 000000000..ad77f974c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + const int height) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + const __m128i v_p_b = xx_loadl_32(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, const int width, + const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +#if CONFIG_EXT_PARTITION +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +#endif // CONFIG_EXT_PARTITION +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +#if CONFIG_HIGHBITDEPTH +static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m128i v_sad_d = _mm_setzero_si128(); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); + const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); + + // Rounded absolute difference + const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); + const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); + + v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); + v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + return xx_hsum_epi32_si32(v_sad_d); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +#if CONFIG_EXT_PARTITION +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +#endif // CONFIG_EXT_PARTITION +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c new file mode 100644 index 000000000..efb3659cf --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_config.h" +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/aom_filter.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int h) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_b = xx_loadl_32(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_b = xx_loadl_32(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_b = xx_loadl_32(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b); + const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +#if CONFIG_EXT_PARTITION +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +#endif // CONFIG_EXT_PARTITION +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +#if CONFIG_HIGHBITDEPTH +static INLINE void hbd_obmc_variance_w4( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_w = xx_loadl_64(pre + n); + const __m128i v_m_d = xx_load_128(mask + n); + const __m128i v_w_d = xx_load_128(wsrc + n); + + const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +static INLINE void hbd_obmc_variance_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w, + const int h) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - w; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p1_w = xx_loadl_64(pre + n + 4); + const __m128i v_m1_d = xx_load_128(mask + n + 4); + const __m128i v_w1_d = xx_load_128(wsrc + n + 4); + const __m128i v_p0_w = xx_loadl_64(pre + n); + const __m128i v_m0_d = xx_load_128(mask + n); + const __m128i v_w0_d = xx_load_128(wsrc + n); + + const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); + const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); + const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); + + const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); + const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); + + const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12); + const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12); + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 8; + + if (n % w == 0) pre += pre_step; + } while (n < w * h); + + *sum += xx_hsum_epi32_si64(v_sum_d); + *sse += xx_hsum_epi32_si64(v_sse_d); +} + +static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } + *sum = (int)sum64; + *sse = (unsigned int)sse64; +} + +static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); +} + +static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, + const int32_t *mask, int w, int h, + unsigned int *sse, int *sum) { + int64_t sum64 = 0; + uint64_t sse64 = 0; + if (w == 128) { + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128, + 32); + pre8 += 32 * pre_stride; + wsrc += 32 * 128; + mask += 32 * 128; + h -= 32; + } while (h > 0); + } else if (w == 64 && h >= 128) { + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64, + 64); + pre8 += 64 * pre_stride; + wsrc += 64 * 64; + mask += 64 * 64; + h -= 64; + } while (h > 0); + } else if (w == 4) { + hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); + } else { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } + *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); + *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); +} + +#define HBD_OBMCVARWXH(W, H) \ + unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#if CONFIG_EXT_PARTITION +HBD_OBMCVARWXH(128, 128) +HBD_OBMCVARWXH(128, 64) +HBD_OBMCVARWXH(64, 128) +#endif // CONFIG_EXT_PARTITION +HBD_OBMCVARWXH(64, 64) +HBD_OBMCVARWXH(64, 32) +HBD_OBMCVARWXH(32, 64) +HBD_OBMCVARWXH(32, 32) +HBD_OBMCVARWXH(32, 16) +HBD_OBMCVARWXH(16, 32) +HBD_OBMCVARWXH(16, 16) +HBD_OBMCVARWXH(16, 8) +HBD_OBMCVARWXH(8, 16) +HBD_OBMCVARWXH(8, 8) +HBD_OBMCVARWXH(8, 4) +HBD_OBMCVARWXH(4, 8) +HBD_OBMCVARWXH(4, 4) +#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm new file mode 100644 index 000000000..954a95b98 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm @@ -0,0 +1,547 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + + vzeroupper + + ; If we can skip this block, then just zero the output + cmp skipmp, 0 + jne .blank + +%ifnidn %1, b_32x32 + + ; Special case for ncoeff == 16, as it is frequent and we can save on + ; not setting up a loop. + cmp ncoeffmp, 16 + jne .generic + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Special case of ncoeff == 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.single: + + movifnidn coeffq, coeffmp + movifnidn zbinq, zbinmp + mova m0, [zbinq] ; m0 = zbin + + ; Get DC and first 15 AC coeffs - in this special case, that is all. +%if CONFIG_HIGHBITDEPTH + ; coeff stored as 32bit numbers but we process them as 16 bit numbers + mova m9, [coeffq] + packssdw m9, [coeffq+16] ; m9 = c[i] + mova m10, [coeffq+32] + packssdw m10, [coeffq+48] ; m10 = c[i] +%else + mova m9, [coeffq] ; m9 = c[i] + mova m10, [coeffq+16] ; m10 = c[i] +%endif + + mov r0, eobmp ; Output pointer + mov r1, qcoeffmp ; Output pointer + mov r2, dqcoeffmp ; Output pointer + + pxor m5, m5 ; m5 = dedicated zero + + pcmpeqw m4, m4 ; All word lanes -1 + paddw m0, m4 ; m0 = zbin - 1 + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, we just write zeros + ; to the outputs and we are done. + por m14, m7, m12 + ptest m14, m14 + jnz .single_nonzero + +%if CONFIG_HIGHBITDEPTH + mova [r1 ], ymm5 + mova [r1+32], ymm5 + mova [r2 ], ymm5 + mova [r2+32], ymm5 +%else + mova [r1], ymm5 + mova [r2], ymm5 +%endif + mov [r0], word 0 + + vzeroupper + RET + +.single_nonzero: + + ; Actual quantization of size 16 block - setup pointers, rounders, etc. + movifnidn r4, roundmp + movifnidn r5, quantmp + mov r3, dequantmp + mov r6, shiftmp + mova m1, [r4] ; m1 = round + mova m2, [r5] ; m2 = quant + mova m3, [r3] ; m3 = dequant + mova m4, [r6] ; m4 = shift + + mov r3, iscanmp + + DEFINE_ARGS eob, qcoeff, dqcoeff, iscan + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + +%if CONFIG_HIGHBITDEPTH + ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq ], m11 + mova [qcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+32], m11 + mova [qcoeffq+48], m6 +%else + mova [qcoeffq ], m8 + mova [qcoeffq+16], m13 +%endif + + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q + +%if CONFIG_HIGHBITDEPTH + ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq ], m11 + mova [dqcoeffq+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+32], m11 + mova [dqcoeffq+48], m6 +%else + mova [dqcoeffq ], m8 + mova [dqcoeffq+16], m13 +%endif + + mova m6, [iscanq] ; m6 = scan[i] + mova m11, [iscanq+16] ; m11 = scan[i] + + pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 + psubw m6, m6, m7 ; m6 = scan[i] + 1 + psubw m11, m11, m12 ; m11 = scan[i] + 1 + pandn m8, m8, m6 ; m8 = max(eob) + pandn m13, m13, m11 ; m13 = max(eob) + pmaxsw m8, m8, m13 + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [eobq], ax + + vzeroupper + RET + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Generic case of ncoeff != 16 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +.generic: + +%endif ; %ifnidn %1, b_32x32 + +DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ + qcoeff, dqcoeff, dequant, eob, scan, iscan + + ; Actual quantization loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant + mova m3, [r2] ; m3 = dequant + pcmpeqw m4, m4 ; All lanes -1 +%ifidn %1, b_32x32 + psubw m0, m4 + psubw m1, m4 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + paddw m0, m4 ; m0 = m0 + 1 + + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + +%if CONFIG_HIGHBITDEPTH + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] +%else + lea coeffq, [ coeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] +%endif + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs +%if CONFIG_HIGHBITDEPTH + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [coeffq+ncoeffq*4+ 0] + packssdw m9, [coeffq+ncoeffq*4+16] + mova m10, [coeffq+ncoeffq*4+32] + packssdw m10, [coeffq+ncoeffq*4+48] +%else + mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, skip forward quickly. + por m14, m7, m12 + ptest m14, m14 + jnz .first_nonzero + +%if CONFIG_HIGHBITDEPTH + mova [qcoeffq+ncoeffq*4 ], ymm5 + mova [qcoeffq+ncoeffq*4+32], ymm5 + mova [dqcoeffq+ncoeffq*4 ], ymm5 + mova [dqcoeffq+ncoeffq*4+32], ymm5 +%else + mova [qcoeffq+ncoeffq*2], ymm5 + mova [dqcoeffq+ncoeffq*2], ymm5 +%endif + + add ncoeffq, mmsize + + punpckhqdq m1, m1 + punpckhqdq m2, m2 + punpckhqdq m3, m3 + punpckhqdq m4, m4 + pxor m8, m8 + + jmp .ac_only_loop + +.first_nonzero: + + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 +%else + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], m13 +%endif + +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m8 + punpckhwd m6, m8, m6 + pmovsxwd m11, m8 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 +%endif + + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] + mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + +.ac_only_loop: + +%if CONFIG_HIGHBITDEPTH + ; pack coeff from 32bit to 16bit array + mova m9, [coeffq+ncoeffq*4+ 0] + packssdw m9, [coeffq+ncoeffq*4+16] + mova m10, [coeffq+ncoeffq*4+32] + packssdw m10, [coeffq+ncoeffq*4+48] +%else + mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + + ; Check if all coeffs are less than zbin. If yes, skip this itertion. + ; And just write zeros as the result would be. + por m14, m7, m12 + ptest m14, m14 + jnz .rest_nonzero + +%if CONFIG_HIGHBITDEPTH + mova [qcoeffq+ncoeffq*4+ 0], ymm5 + mova [qcoeffq+ncoeffq*4+32], ymm5 + mova [dqcoeffq+ncoeffq*4+ 0], ymm5 + mova [dqcoeffq+ncoeffq*4+32], ymm5 +%else + mova [qcoeffq+ncoeffq*2+ 0], ymm5 + mova [dqcoeffq+ncoeffq*2+ 0], ymm5 +%endif + add ncoeffq, mmsize + jnz .ac_only_loop + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [r2], ax + vzeroupper + RET + +.rest_nonzero: + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m14, m4 ; m14 = m14*qsh>>16 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m14 + punpckhwd m6, m14, m6 + pmovsxwd m11, m14 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 +%else + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%endif + +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif + +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pcmpgtw m6, m5, m14 + punpckhwd m6, m14, m6 + pmovsxwd m11, m14 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pcmpgtw m6, m5, m13 + punpckhwd m6, m13, m6 + pmovsxwd m11, m13 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], m13 +%endif + + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jnz .ac_only_loop + + ; Horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + movq rax, m8 + mov [r2], ax + vzeroupper + RET + + ; Skip-block, i.e. just write all zeroes +.blank: + +DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ + qcoeff, dqcoeff, dequant, eob, scan, iscan + + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + +DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob + +%if CONFIG_HIGHBITDEPTH + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] +%else + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] +%endif + + neg ncoeffq + pxor m7, m7 + +.blank_loop: +%if CONFIG_HIGHBITDEPTH + mova [dqcoeffq+ncoeffq*4+ 0], ymm7 + mova [dqcoeffq+ncoeffq*4+32], ymm7 + mova [qcoeffq+ncoeffq*4+ 0], ymm7 + mova [qcoeffq+ncoeffq*4+32], ymm7 +%else + mova [dqcoeffq+ncoeffq*2+ 0], ymm7 + mova [qcoeffq+ncoeffq*2+ 0], ymm7 +%endif + add ncoeffq, mmsize + jl .blank_loop + + mov [eobq], word 0 + + vzeroupper + RET +%endmacro + +INIT_XMM avx +QUANTIZE_FN b, 7 +QUANTIZE_FN b_32x32, 7 + +END diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c new file mode 100644 index 000000000..890c1f01e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { +#if CONFIG_HIGHBITDEPTH + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); +#else + return _mm_load_si128((const __m128i *)coeff_ptr); +#endif +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { +#if CONFIG_HIGHBITDEPTH + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +#else + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); +#endif +} + +void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + __m128i zero; + (void)scan_ptr; + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + zero = _mm_setzero_si128(); + if (!skip_block) { + __m128i eob; + __m128i zbin; + __m128i round, quant, dequant, shift; + { + __m128i coeff0, coeff1; + + // Setup global values + { + __m128i pw_1; + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + pw_1 = _mm_set1_epi16(1); + zbin = _mm_sub_epi16(zbin, pw_1); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + } + + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + __m128i cmp_mask0, cmp_mask1; + // Do DC and first 15 AC + coeff0 = load_coefficients(coeff_ptr + n_coeffs); + coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); + qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); + qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); + shift = _mm_unpackhi_epi64(shift, shift); + qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); + store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); + store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + __m128i cmp_mask0, cmp_mask1; + + coeff0 = load_coefficients(coeff_ptr + n_coeffs); + coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); + qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); + qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); + qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); + store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); + store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + } + + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); + } + n_coeffs += 8 * 2; + } + + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); + } + } else { + do { + store_coefficients(zero, dqcoeff_ptr + n_coeffs); + store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); + store_coefficients(zero, qcoeff_ptr + n_coeffs); + store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); + n_coeffs += 8 * 2; + } while (n_coeffs < 0); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm new file mode 100644 index 000000000..36b4dddbd --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -0,0 +1,349 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_1: times 8 dw 1 + +SECTION .text + +; TODO(yunqingwang)fix quantize_b code for skip=1 case. +%macro QUANTIZE_FN 2 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ + shift, qcoeff, dqcoeff, dequant, \ + eob, scan, iscan + cmp dword skipm, 0 + jne .blank + + ; actual quantize loop - setup pointers, rounders, etc. + movifnidn coeffq, coeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, dequantmp + movifnidn zbinq, zbinmp + movifnidn roundq, roundmp + movifnidn quantq, quantmp + mova m0, [zbinq] ; m0 = zbin + mova m1, [roundq] ; m1 = round + mova m2, [quantq] ; m2 = quant +%ifidn %1, b_32x32 + pcmpeqw m5, m5 + psrlw m5, 15 + paddw m0, m5 + paddw m1, m5 + psrlw m0, 1 ; m0 = (m0 + 1) / 2 + psrlw m1, 1 ; m1 = (m1 + 1) / 2 +%endif + mova m3, [r2q] ; m3 = dequant + psubw m0, [pw_1] + mov r2, shiftmp + mov r3, qcoeffmp + mova m4, [r2] ; m4 = shift + mov r4, dqcoeffmp + mov r5, iscanmp +%ifidn %1, b_32x32 + psllw m4, 1 +%endif + pxor m5, m5 ; m5 = dedicated zero + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob +%if CONFIG_HIGHBITDEPTH + lea coeffq, [ coeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] + lea dqcoeffq, [dqcoeffq+ncoeffq*4] +%else + lea coeffq, [ coeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] + lea dqcoeffq, [dqcoeffq+ncoeffq*2] +%endif + lea iscanq, [ iscanq+ncoeffq*2] + neg ncoeffq + + ; get DC and first 15 AC coeffs +%if CONFIG_HIGHBITDEPTH + ; coeff stored as 32bit numbers & require 16bit numbers + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] +%else + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + punpckhqdq m0, m0 + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin + paddsw m6, m1 ; m6 += round + punpckhqdq m1, m1 + paddsw m11, m1 ; m11 += round + pmulhw m8, m6, m2 ; m8 = m6*q>>16 + punpckhqdq m2, m2 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m8, m6 ; m8 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m8, m4 ; m8 = m8*qsh>>16 + punpckhqdq m4, m4 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m8, m9 ; m8 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m8, m7 + pand m13, m12 +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else + mova [qcoeffq+ncoeffq*2+ 0], m8 + mova [qcoeffq+ncoeffq*2+16], m13 +%endif +%ifidn %1, b_32x32 + pabsw m8, m8 + pabsw m13, m13 +%endif + pmullw m8, m3 ; dqc[i] = qc[i] * q + punpckhqdq m3, m3 + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m8, 1 + psrlw m13, 1 + psignw m8, m9 + psignw m13, m10 +%endif +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m8 + mova m6, m8 + pcmpgtw m5, m8 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else + mova [dqcoeffq+ncoeffq*2+ 0], m8 + mova [dqcoeffq+ncoeffq*2+16], m13 +%endif + pcmpeqw m8, m5 ; m8 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m8, m6 ; m8 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m13 + add ncoeffq, mmsize + jz .accumulate_eob + +.ac_only_loop: +%if CONFIG_HIGHBITDEPTH + ; pack coeff from 32bit to 16bit array + mova m9, [ coeffq+ncoeffq*4+ 0] + packssdw m9, [ coeffq+ncoeffq*4+16] + mova m10, [ coeffq+ncoeffq*4+32] + packssdw m10, [ coeffq+ncoeffq*4+48] +%else + mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] + mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] +%endif + pabsw m6, m9 ; m6 = abs(m9) + pabsw m11, m10 ; m11 = abs(m10) + pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin + pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin +%ifidn %1, b_32x32 + pmovmskb r6d, m7 + pmovmskb r2d, m12 + or r6, r2 + jz .skip_iter +%endif + paddsw m6, m1 ; m6 += round + paddsw m11, m1 ; m11 += round + pmulhw m14, m6, m2 ; m14 = m6*q>>16 + pmulhw m13, m11, m2 ; m13 = m11*q>>16 + paddw m14, m6 ; m14 += m6 + paddw m13, m11 ; m13 += m11 + pmulhw m14, m4 ; m14 = m14*qsh>>16 + pmulhw m13, m4 ; m13 = m13*qsh>>16 + psignw m14, m9 ; m14 = reinsert sign + psignw m13, m10 ; m13 = reinsert sign + pand m14, m7 + pand m13, m12 +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + pxor m11, m11 + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+ 0], m11 + mova [qcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [qcoeffq+ncoeffq*4+32], m11 + mova [qcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 ; reset m5 to zero register +%else + mova [qcoeffq+ncoeffq*2+ 0], m14 + mova [qcoeffq+ncoeffq*2+16], m13 +%endif +%ifidn %1, b_32x32 + pabsw m14, m14 + pabsw m13, m13 +%endif + pmullw m14, m3 ; dqc[i] = qc[i] * q + pmullw m13, m3 ; dqc[i] = qc[i] * q +%ifidn %1, b_32x32 + psrlw m14, 1 + psrlw m13, 1 + psignw m14, m9 + psignw m13, m10 +%endif +%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + mova m11, m14 + mova m6, m14 + pcmpgtw m5, m14 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+ 0], m11 + mova [dqcoeffq+ncoeffq*4+16], m6 + pxor m5, m5 + mova m11, m13 + mova m6, m13 + pcmpgtw m5, m13 + punpcklwd m11, m5 + punpckhwd m6, m5 + mova [dqcoeffq+ncoeffq*4+32], m11 + mova [dqcoeffq+ncoeffq*4+48], m6 + pxor m5, m5 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m14 + mova [dqcoeffq+ncoeffq*2+16], m13 +%endif + pcmpeqw m14, m5 ; m14 = c[i] == 0 + pcmpeqw m13, m5 ; m13 = c[i] == 0 + mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] + mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] + psubw m6, m7 ; m6 = scan[i] + 1 + psubw m11, m12 ; m11 = scan[i] + 1 + pandn m14, m6 ; m14 = max(eob) + pandn m13, m11 ; m13 = max(eob) + pmaxsw m8, m14 + pmaxsw m8, m13 + add ncoeffq, mmsize + jl .ac_only_loop + +%ifidn %1, b_32x32 + jmp .accumulate_eob +.skip_iter: +%if CONFIG_HIGHBITDEPTH + mova [qcoeffq+ncoeffq*4+ 0], m5 + mova [qcoeffq+ncoeffq*4+16], m5 + mova [qcoeffq+ncoeffq*4+32], m5 + mova [qcoeffq+ncoeffq*4+48], m5 + mova [dqcoeffq+ncoeffq*4+ 0], m5 + mova [dqcoeffq+ncoeffq*4+16], m5 + mova [dqcoeffq+ncoeffq*4+32], m5 + mova [dqcoeffq+ncoeffq*4+48], m5 +%else + mova [qcoeffq+ncoeffq*2+ 0], m5 + mova [qcoeffq+ncoeffq*2+16], m5 + mova [dqcoeffq+ncoeffq*2+ 0], m5 + mova [dqcoeffq+ncoeffq*2+16], m5 +%endif + add ncoeffq, mmsize + jl .ac_only_loop +%endif + +.accumulate_eob: + ; horizontally accumulate/max eobs and write into [eob] memory pointer + mov r2, eobmp + pshufd m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0xe + pmaxsw m8, m7 + pshuflw m7, m8, 0x1 + pmaxsw m8, m7 + pextrw r6, m8, 0 + mov [r2], r6 + RET + + ; skip-block, i.e. just write all zeroes +.blank: + mov r0, dqcoeffmp + movifnidn ncoeffq, ncoeffmp + mov r2, qcoeffmp + mov r3, eobmp + DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob +%if CONFIG_HIGHBITDEPTH + lea dqcoeffq, [dqcoeffq+ncoeffq*4] + lea qcoeffq, [ qcoeffq+ncoeffq*4] +%else + lea dqcoeffq, [dqcoeffq+ncoeffq*2] + lea qcoeffq, [ qcoeffq+ncoeffq*2] +%endif + neg ncoeffq + pxor m7, m7 +.blank_loop: +%if CONFIG_HIGHBITDEPTH + mova [dqcoeffq+ncoeffq*4+ 0], m7 + mova [dqcoeffq+ncoeffq*4+16], m7 + mova [dqcoeffq+ncoeffq*4+32], m7 + mova [dqcoeffq+ncoeffq*4+48], m7 + mova [qcoeffq+ncoeffq*4+ 0], m7 + mova [qcoeffq+ncoeffq*4+16], m7 + mova [qcoeffq+ncoeffq*4+32], m7 + mova [qcoeffq+ncoeffq*4+48], m7 +%else + mova [dqcoeffq+ncoeffq*2+ 0], m7 + mova [dqcoeffq+ncoeffq*2+16], m7 + mova [qcoeffq+ncoeffq*2+ 0], m7 + mova [qcoeffq+ncoeffq*2+16], m7 +%endif + add ncoeffq, mmsize + jl .blank_loop + mov word [eobq], 0 + RET +%endmacro + +INIT_XMM ssse3 +QUANTIZE_FN b, 7 +QUANTIZE_FN b_32x32, 7 diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c new file mode 100644 index 000000000..e60f518b4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include // AVX2 +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 32; i++) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m128i sum; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } + _mm256_zeroupper(); +} + +void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; + __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; + __m256i ref3_reg, ref3next_reg; + __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m256i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm256_set1_epi16(0); + sum_ref1 = _mm256_set1_epi16(0); + sum_ref2 = _mm256_set1_epi16(0); + sum_ref3 = _mm256_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load 64 bytes from src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)src); + srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); + ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); + ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); + ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); + ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); + ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); + ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); + ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); + ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); + ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); + ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); + ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); + + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m128i sum; + + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. + // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm256_slli_si256(sum_ref1, 4); + sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); + sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), + _mm256_extractf128_si256(sum_mlow, 1)); + + _mm_storeu_si128((__m128i *)(res), sum); + } + _mm256_zeroupper(); +} + +void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += src_stride << 5; + rf[0] += ref_stride << 5; + rf[1] += ref_stride << 5; + rf[2] += ref_stride << 5; + rf[3] += ref_stride << 5; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} + +void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + unsigned int half_width = 32; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += half_width; + rf[0] += half_width; + rf[1] += half_width; + rf[2] += half_width; + rf[3] += half_width; + aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm new file mode 100644 index 000000000..8f04ef2f3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -0,0 +1,253 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_4x2x4 5-6 0 + movd m0, [srcq +%2] +%if %1 == 1 + movd m6, [ref1q+%3] + movd m4, [ref2q+%3] + movd m7, [ref3q+%3] + movd m5, [ref4q+%3] + movd m1, [srcq +%4] + movd m2, [ref1q+%5] + punpckldq m0, m1 + punpckldq m6, m2 + movd m1, [ref2q+%5] + movd m2, [ref3q+%5] + movd m3, [ref4q+%5] + punpckldq m4, m1 + punpckldq m7, m2 + punpckldq m5, m3 + movlhps m0, m0 + movlhps m6, m4 + movlhps m7, m5 + psadbw m6, m0 + psadbw m7, m0 +%else + movd m1, [ref1q+%3] + movd m5, [ref1q+%5] + movd m2, [ref2q+%3] + movd m4, [ref2q+%5] + punpckldq m1, m5 + punpckldq m2, m4 + movd m3, [ref3q+%3] + movd m5, [ref3q+%5] + punpckldq m3, m5 + movd m4, [ref4q+%3] + movd m5, [ref4q+%5] + punpckldq m4, m5 + movd m5, [srcq +%4] + punpckldq m0, m5 + movlhps m0, m0 + movlhps m1, m2 + movlhps m3, m4 + psadbw m1, m0 + psadbw m3, m0 + paddd m6, m1 + paddd m7, m3 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_8x2x4 5-6 0 + movh m0, [srcq +%2] +%if %1 == 1 + movh m4, [ref1q+%3] + movh m5, [ref2q+%3] + movh m6, [ref3q+%3] + movh m7, [ref4q+%3] + movhps m0, [srcq +%4] + movhps m4, [ref1q+%5] + movhps m5, [ref2q+%5] + movhps m6, [ref3q+%5] + movhps m7, [ref4q+%5] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movh m1, [ref1q+%3] + movh m2, [ref2q+%3] + movh m3, [ref3q+%3] + movhps m0, [srcq +%4] + movhps m1, [ref1q+%5] + movhps m2, [ref2q+%5] + movhps m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movh m1, [ref4q+%3] + movhps m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif +%endmacro + +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_16x2x4 5-6 0 + ; 1st 16 px + mova m0, [srcq +%2] +%if %1 == 1 + movu m4, [ref1q+%3] + movu m5, [ref2q+%3] + movu m6, [ref3q+%3] + movu m7, [ref4q+%3] + psadbw m4, m0 + psadbw m5, m0 + psadbw m6, m0 + psadbw m7, m0 +%else + movu m1, [ref1q+%3] + movu m2, [ref2q+%3] + movu m3, [ref3q+%3] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%3] + paddd m5, m2 + paddd m6, m3 + psadbw m1, m0 + paddd m7, m1 +%endif + + ; 2nd 16 px + mova m0, [srcq +%4] + movu m1, [ref1q+%5] + movu m2, [ref2q+%5] + movu m3, [ref3q+%5] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + paddd m4, m1 + movu m1, [ref4q+%5] + paddd m5, m2 + paddd m6, m3 +%if %6 == 1 + lea srcq, [srcq +src_strideq*2] + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] +%endif + psadbw m1, m0 + paddd m7, m1 +%endmacro + +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_32x2x4 5-6 0 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +%endmacro + +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_64x2x4 5-6 0 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +%endmacro + +; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end +%macro PROCESS_128x2x4 5-6 0 + PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64 + PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6 +%endmacro + +; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, +; uint8_t *ref[4], int ref_stride, +; uint32_t res[4]); +; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 +%macro SADNXN4D 2 +%if UNIX64 +cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] + mov ref3q, [ref1q+gprsize*2] + mov ref4q, [ref1q+gprsize*3] + mov ref1q, [ref1q+gprsize*0] + + PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 +%rep (%2-4)/2 + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 +%endrep + PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + +%if %1 > 4 + pslldq m5, 4 + pslldq m7, 4 + por m4, m5 + por m6, m7 + mova m5, m4 + mova m7, m6 + punpcklqdq m4, m6 + punpckhqdq m5, m7 + movifnidn r4, r4mp + paddd m4, m5 + movu [r4], m4 + RET +%else + movifnidn r4, r4mp + pshufd m6, m6, 0x08 + pshufd m7, m7, 0x08 + movq [r4+0], m6 + movq [r4+8], m7 + RET +%endif +%endmacro + +INIT_XMM sse2 +%if CONFIG_EXT_PARTITION +SADNXN4D 128, 128 +SADNXN4D 128, 64 +SADNXN4D 64, 128 +%endif +SADNXN4D 64, 64 +SADNXN4D 64, 32 +SADNXN4D 32, 64 +SADNXN4D 32, 32 +SADNXN4D 32, 16 +SADNXN4D 16, 32 +SADNXN4D 16, 16 +SADNXN4D 16, 8 +SADNXN4D 8, 16 +SADNXN4D 8, 8 +SADNXN4D 8, 4 +SADNXN4D 4, 8 +SADNXN4D 4, 4 diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c new file mode 100644 index 000000000..efba61289 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_avx2.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +#define FSAD64_H(h) \ + unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSAD32_H(h) \ + unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSAD64 \ + FSAD64_H(64); \ + FSAD64_H(32); + +#define FSAD32 \ + FSAD32_H(64); \ + FSAD32_H(32); \ + FSAD32_H(16); + +/* clang-format off */ +FSAD64 +FSAD32 +/* clang-format on */ + +#undef FSAD64 +#undef FSAD32 +#undef FSAD64_H +#undef FSAD32_H + +#define FSADAVG64_H(h) \ + unsigned int aom_sad64x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + for (i = 0; i < h; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref_stride; \ + src_ptr += src_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG32_H(h) \ + unsigned int aom_sad32x##h##_avg_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + int i, res; \ + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ + __m256i sum_sad = _mm256_setzero_si256(); \ + __m256i sum_sad_h; \ + __m128i sum_sad128; \ + int ref2_stride = ref_stride << 1; \ + int src2_stride = src_stride << 1; \ + int max = h >> 1; \ + for (i = 0; i < max; i++) { \ + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ + ref1_reg = _mm256_avg_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \ + ref2_reg = _mm256_avg_epu8( \ + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \ + sad1_reg = _mm256_sad_epu8( \ + ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ + sad2_reg = _mm256_sad_epu8( \ + ref2_reg, \ + _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ + sum_sad = \ + _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ + ref_ptr += ref2_stride; \ + src_ptr += src2_stride; \ + second_pred += 64; \ + } \ + sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ + res = _mm_cvtsi128_si32(sum_sad128); \ + _mm256_zeroupper(); \ + return res; \ + } + +#define FSADAVG64 \ + FSADAVG64_H(64); \ + FSADAVG64_H(32); + +#define FSADAVG32 \ + FSADAVG32_H(64); \ + FSADAVG32_H(32); \ + FSADAVG32_H(16); + +/* clang-format off */ +FSADAVG64 +FSADAVG32 +/* clang-format on */ + +#undef FSADAVG64 +#undef FSADAVG32 +#undef FSADAVG64_H +#undef FSADAVG32_H diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c new file mode 100644 index 000000000..196394379 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c @@ -0,0 +1,1043 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" + +// SAD +static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { + // input 8 32-bit summation + __m128i lo128, hi128; + __m256i u = _mm256_srli_si256(*v, 8); + u = _mm256_add_epi32(u, *v); + + // 4 32-bit summation + hi128 = _mm256_extracti128_si256(u, 1); + lo128 = _mm256_castsi256_si128(u); + lo128 = _mm_add_epi32(hi128, lo128); + + // 2 32-bit summation + hi128 = _mm_srli_si128(lo128, 4); + lo128 = _mm_add_epi32(lo128, hi128); + + return (unsigned int)_mm_cvtsi128_si32(lo128); +} + +unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + + // first 4 rows + __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + __m256i u0 = _mm256_sub_epi16(s0, r0); + __m256i u1 = _mm256_sub_epi16(s1, r1); + __m256i u2 = _mm256_sub_epi16(s2, r2); + __m256i u3 = _mm256_sub_epi16(s3, r3); + __m256i zero = _mm256_setzero_si256(); + __m256i sum0, sum1; + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum0 = _mm256_add_epi16(u0, u1); + sum0 = _mm256_add_epi16(sum0, u2); + sum0 = _mm256_add_epi16(sum0, u3); + + // second 4 rows + src_ptr += src_stride << 2; + ref_ptr += ref_stride << 2; + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + u0 = _mm256_sub_epi16(s0, r0); + u1 = _mm256_sub_epi16(s1, r1); + u2 = _mm256_sub_epi16(s2, r2); + u3 = _mm256_sub_epi16(s3, r3); + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum1 = _mm256_add_epi16(u0, u1); + sum1 = _mm256_add_epi16(sum1, u2); + sum1 = _mm256_add_epi16(sum1, u3); + + // find out the SAD + s0 = _mm256_unpacklo_epi16(sum0, zero); + s1 = _mm256_unpackhi_epi16(sum0, zero); + r0 = _mm256_unpacklo_epi16(sum1, zero); + r1 = _mm256_unpackhi_epi16(sum1, zero); + s0 = _mm256_add_epi32(s0, s1); + r0 = _mm256_add_epi32(r0, r1); + sum0 = _mm256_add_epi32(s0, r0); + // 8 32-bit summation + + return (unsigned int)get_sad_from_mm256_epi32(&sum0); +} + +unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); + __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3; + __m256i sum0; + __m256i sum = _mm256_setzero_si256(); + const __m256i zero = _mm256_setzero_si256(); + int row = 0; + + // Loop for every 4 rows + while (row < 16) { + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + u0 = _mm256_sub_epi16(s0, r0); + u1 = _mm256_sub_epi16(s1, r1); + u2 = _mm256_sub_epi16(s2, r2); + u3 = _mm256_sub_epi16(s3, r3); + + u0 = _mm256_abs_epi16(u0); + u1 = _mm256_abs_epi16(u1); + u2 = _mm256_abs_epi16(u2); + u3 = _mm256_abs_epi16(u3); + + sum0 = _mm256_add_epi16(u0, u1); + sum0 = _mm256_add_epi16(sum0, u2); + sum0 = _mm256_add_epi16(sum0, u3); + + s0 = _mm256_unpacklo_epi16(sum0, zero); + s1 = _mm256_unpackhi_epi16(sum0, zero); + sum = _mm256_add_epi32(sum, s0); + sum = _mm256_add_epi32(sum, s1); + // 8 32-bit summation + + row += 4; + src_ptr += src_stride << 2; + ref_ptr += ref_stride << 2; + } + return get_sad_from_mm256_epi32(&sum); +} + +static void sad32x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s0, s1, s2, s3, r0, r1, r2, r3; + const __m256i zero = _mm256_setzero_si256(); + int row_sections = 0; + + while (row_sections < 2) { + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + + if (sec_ptr) { + r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); + r1 = _mm256_avg_epu16( + r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r2 = _mm256_avg_epu16( + r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r3 = _mm256_avg_epu16( + r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + s0 = _mm256_sub_epi16(s0, r0); + s1 = _mm256_sub_epi16(s1, r1); + s2 = _mm256_sub_epi16(s2, r2); + s3 = _mm256_sub_epi16(s3, r3); + + s0 = _mm256_abs_epi16(s0); + s1 = _mm256_abs_epi16(s1); + s2 = _mm256_abs_epi16(s2); + s3 = _mm256_abs_epi16(s3); + + s0 = _mm256_add_epi16(s0, s1); + s0 = _mm256_add_epi16(s0, s2); + s0 = _mm256_add_epi16(s0, s3); + + r0 = _mm256_unpacklo_epi16(s0, zero); + r1 = _mm256_unpackhi_epi16(s0, zero); + + r0 = _mm256_add_epi32(r0, r1); + *sad_acc = _mm256_add_epi32(*sad_acc, r0); + + row_sections += 1; + src_ptr += src_stride << 1; + ref_ptr += ref_stride << 1; + if (sec_ptr) sec_ptr += 32 << 1; + } +} + +unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 4; + ref += ref_stride << 4; + sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 4; + ref += ref_stride << 4; + sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 5; + ref += ref_stride << 5; + sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +static void sad64x2(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[8], r[8]; + const __m256i zero = _mm256_setzero_si256(); + + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); + s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32)); + s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32)); + r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + r[4] = _mm256_avg_epu16( + r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); + r[5] = _mm256_avg_epu16( + r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); + r[6] = _mm256_avg_epu16( + r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); + r[7] = _mm256_avg_epu16( + r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + } + + s[0] = _mm256_sub_epi16(s[0], r[0]); + s[1] = _mm256_sub_epi16(s[1], r[1]); + s[2] = _mm256_sub_epi16(s[2], r[2]); + s[3] = _mm256_sub_epi16(s[3], r[3]); + s[4] = _mm256_sub_epi16(s[4], r[4]); + s[5] = _mm256_sub_epi16(s[5], r[5]); + s[6] = _mm256_sub_epi16(s[6], r[6]); + s[7] = _mm256_sub_epi16(s[7], r[7]); + + s[0] = _mm256_abs_epi16(s[0]); + s[1] = _mm256_abs_epi16(s[1]); + s[2] = _mm256_abs_epi16(s[2]); + s[3] = _mm256_abs_epi16(s[3]); + s[4] = _mm256_abs_epi16(s[4]); + s[5] = _mm256_abs_epi16(s[5]); + s[6] = _mm256_abs_epi16(s[6]); + s[7] = _mm256_abs_epi16(s[7]); + + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); + + s[4] = _mm256_add_epi16(s[4], s[5]); + s[4] = _mm256_add_epi16(s[4], s[6]); + s[4] = _mm256_add_epi16(s[4], s[7]); + + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + r[2] = _mm256_unpacklo_epi16(s[4], zero); + r[3] = _mm256_unpackhi_epi16(s[4], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + r[0] = _mm256_add_epi32(r[0], r[2]); + r[0] = _mm256_add_epi32(r[0], r[3]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); +} + +unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 5; + ref += ref_stride << 5; + sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +#if CONFIG_EXT_PARTITION +static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[8], r[8]; + const __m256i zero = _mm256_setzero_si256(); + + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64)); + s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80)); + s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96)); + s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64)); + r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80)); + r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96)); + r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + r[4] = _mm256_avg_epu16( + r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); + r[5] = _mm256_avg_epu16( + r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); + r[6] = _mm256_avg_epu16( + r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); + r[7] = _mm256_avg_epu16( + r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + } + + s[0] = _mm256_sub_epi16(s[0], r[0]); + s[1] = _mm256_sub_epi16(s[1], r[1]); + s[2] = _mm256_sub_epi16(s[2], r[2]); + s[3] = _mm256_sub_epi16(s[3], r[3]); + s[4] = _mm256_sub_epi16(s[4], r[4]); + s[5] = _mm256_sub_epi16(s[5], r[5]); + s[6] = _mm256_sub_epi16(s[6], r[6]); + s[7] = _mm256_sub_epi16(s[7], r[7]); + + s[0] = _mm256_abs_epi16(s[0]); + s[1] = _mm256_abs_epi16(s[1]); + s[2] = _mm256_abs_epi16(s[2]); + s[3] = _mm256_abs_epi16(s[3]); + s[4] = _mm256_abs_epi16(s[4]); + s[5] = _mm256_abs_epi16(s[5]); + s[6] = _mm256_abs_epi16(s[6]); + s[7] = _mm256_abs_epi16(s[7]); + + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); + + s[4] = _mm256_add_epi16(s[4], s[5]); + s[4] = _mm256_add_epi16(s[4], s[6]); + s[4] = _mm256_add_epi16(s[4], s[7]); + + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + r[2] = _mm256_unpacklo_epi16(s[4], zero); + r[3] = _mm256_unpackhi_epi16(s[4], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + r[0] = _mm256_add_epi32(r[0], r[2]); + r[0] = _mm256_add_epi32(r[0], r[3]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); +} + +unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, NULL, &sad); + srcp += src_stride; + refp += ref_stride; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 6; + ref += ref_stride << 6; + sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); + return sum; +} + +unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride) { + uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); + src += src_stride << 6; + ref += ref_stride << 6; + sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); + return sum; +} +#endif // CONFIG_EXT_PARTITION + +// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. +static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s0, s1, s2, s3, r0, r1, r2, r3; + const __m256i zero = _mm256_setzero_si256(); + + s0 = _mm256_loadu_si256((const __m256i *)src_ptr); + s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); + r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + if (sec_ptr) { + r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); + r1 = _mm256_avg_epu16(r1, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r2 = _mm256_avg_epu16(r2, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r3 = _mm256_avg_epu16(r3, + _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + + s0 = _mm256_sub_epi16(s0, r0); + s1 = _mm256_sub_epi16(s1, r1); + s2 = _mm256_sub_epi16(s2, r2); + s3 = _mm256_sub_epi16(s3, r3); + + s0 = _mm256_abs_epi16(s0); + s1 = _mm256_abs_epi16(s1); + s2 = _mm256_abs_epi16(s2); + s3 = _mm256_abs_epi16(s3); + + s0 = _mm256_add_epi16(s0, s1); + s0 = _mm256_add_epi16(s0, s2); + s0 = _mm256_add_epi16(s0, s3); + + r0 = _mm256_unpacklo_epi16(s0, zero); + r1 = _mm256_unpackhi_epi16(s0, zero); + + r0 = _mm256_add_epi32(r0, r1); + *sad_acc = _mm256_add_epi32(*sad_acc, r0); +} + +unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + + // Next 4 rows + srcp += src_stride << 2; + refp += ref_stride << 2; + secp += 64; + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 3; + uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 4) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 4; + uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 32 << left_shift; + sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 16) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +#if CONFIG_EXT_PARTITION +unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 6; + uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 64 << left_shift; + sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + int row = 0; + while (row < 64) { + sad128x1(srcp, refp, secp, &sad); + srcp += src_stride; + refp += ref_stride; + secp += 16 << 3; + row += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + +unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + unsigned int sum; + const int left_shift = 6; + + sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << left_shift; + ref += ref_stride << left_shift; + second_pred += 128 << left_shift; + sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} +#endif // CONFIG_EXT_PARTITION + +// SAD 4D +// Combine 4 __m256i vectors to uint32_t result[4] +static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, + uint32_t *res) { + __m256i u0, u1, u2, u3; + const __m256i mask = _mm256_set1_epi64x(UINT32_MAX); + __m128i sad; + + // 8 32-bit summation + u0 = _mm256_srli_si256(v[0], 4); + u1 = _mm256_srli_si256(v[1], 4); + u2 = _mm256_srli_si256(v[2], 4); + u3 = _mm256_srli_si256(v[3], 4); + + u0 = _mm256_add_epi32(u0, v[0]); + u1 = _mm256_add_epi32(u1, v[1]); + u2 = _mm256_add_epi32(u2, v[2]); + u3 = _mm256_add_epi32(u3, v[3]); + + u0 = _mm256_and_si256(u0, mask); + u1 = _mm256_and_si256(u1, mask); + u2 = _mm256_and_si256(u2, mask); + u3 = _mm256_and_si256(u3, mask); + // 4 32-bit summation, evenly positioned + + u1 = _mm256_slli_si256(u1, 4); + u3 = _mm256_slli_si256(u3, 4); + + u0 = _mm256_or_si256(u0, u1); + u2 = _mm256_or_si256(u2, u3); + // 8 32-bit summation, interleaved + + u1 = _mm256_unpacklo_epi64(u0, u2); + u3 = _mm256_unpackhi_epi64(u0, u2); + + u0 = _mm256_add_epi32(u1, u3); + sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1), + _mm256_castsi256_si128(u0)); + _mm_storeu_si128((__m128i *)res, sad); +} + +static void convert_pointers(const uint8_t *const ref8[], + const uint16_t *ref[]) { + ref[0] = CONVERT_TO_SHORTPTR(ref8[0]); + ref[1] = CONVERT_TO_SHORTPTR(ref8[1]); + ref[2] = CONVERT_TO_SHORTPTR(ref8[2]); + ref[3] = CONVERT_TO_SHORTPTR(ref8[3]); +} + +static void init_sad(__m256i *s) { + s[0] = _mm256_setzero_si256(); + s[1] = _mm256_setzero_si256(); + s[2] = _mm256_setzero_si256(); + s[3] = _mm256_setzero_si256(); +} + +void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first8rows[4]; + uint32_t second8rows[4]; + const uint8_t *ref[4]; + const int shift_for_8_rows = 3; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows); + src += src_stride << shift_for_8_rows; + ref[0] += ref_stride << shift_for_8_rows; + ref[1] += ref_stride << shift_for_8_rows; + ref[2] += ref_stride << shift_for_8_rows; + ref[3] += ref_stride << shift_for_8_rows; + aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows); + sad_array[0] = first8rows[0] + second8rows[0]; + sad_array[1] = first8rows[1] + second8rows[1]; + sad_array[2] = first8rows[2] + second8rows[2]; + sad_array[3] = first8rows[3] + second8rows[3]; +} + +void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 4; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_4_rows = 2; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 4) { + sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 4; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 5; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + const int shift_for_rows = 1; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 16) { + sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); + srcp += src_stride << shift_for_rows; + refp[i] += ref_stride << shift_for_rows; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 5; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +#if CONFIG_EXT_PARTITION +void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 6; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} + +void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + __m256i sad_vec[4]; + const uint16_t *refp[4]; + const uint16_t *keep = CONVERT_TO_SHORTPTR(src); + const uint16_t *srcp; + int i; + int rows_section; + + init_sad(sad_vec); + convert_pointers(ref_array, refp); + + for (i = 0; i < 4; ++i) { + srcp = keep; + rows_section = 0; + while (rows_section < 64) { + sad128x1(srcp, refp[i], NULL, &sad_vec[i]); + srcp += src_stride; + refp[i] += ref_stride; + rows_section++; + } + } + get_4d_sad_from_mm256_epi32(sad_vec, sad_array); +} + +void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { + uint32_t first_half[4]; + uint32_t second_half[4]; + const uint8_t *ref[4]; + const int shift_for_rows = 6; + + ref[0] = ref_array[0]; + ref[1] = ref_array[1]; + ref[2] = ref_array[2]; + ref[3] = ref_array[3]; + + aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); + src += src_stride << shift_for_rows; + ref[0] += ref_stride << shift_for_rows; + ref[1] += ref_stride << shift_for_rows; + ref[2] += ref_stride << shift_for_rows; + ref[3] += ref_stride << shift_for_rows; + aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); + sad_array[0] = first_half[0] + second_half[0]; + sad_array[1] = first_half[1] + second_half[1]; + sad_array[2] = first_half[2] + second_half[2]; + sad_array[3] = first_half[3] + second_half[3]; +} +#endif // CONFIG_EXT_PARTITION diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c new file mode 100644 index 000000000..4419c65b2 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" + +static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + __m256i s1, s2, r1, r2; + __m256i sum = _mm256_setzero_si256(); + __m128i sum_i128; + int i; + + for (i = 0; i < 16; ++i) { + r1 = _mm256_loadu_si256((__m256i const *)ref_ptr); + r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr)); + s2 = _mm256_sad_epu8( + r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2)); + ref_ptr += ref_stride << 1; + src_ptr += src_stride << 1; + } + + sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8)); + sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1), + _mm256_castsi256_si128(sum)); + return _mm_cvtsi128_si32(sum_i128); +} + +static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + unsigned int half_width = 32; + uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += half_width; + ref_ptr += half_width; + sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 5; + ref_ptr += ref_stride << 5; + sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + unsigned int half_width = 64; + uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += half_width; + ref_ptr += half_width; + sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride); + return sum; +} + +static void sad64x64x4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + __m128i *res) { + uint32_t sum[4]; + aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum); + *res = _mm_loadu_si128((const __m128i *)sum); +} + +void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m128i sum0, sum1; + const uint8_t *rf[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); + src += src_stride << 6; + rf[0] += ref_stride << 6; + rf[1] += ref_stride << 6; + rf[2] += ref_stride << 6; + rf[3] += ref_stride << 6; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); + sum0 = _mm_add_epi32(sum0, sum1); + _mm_storeu_si128((__m128i *)res, sum0); +} + +void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m128i sum0, sum1; + unsigned int half_width = 64; + const uint8_t *rf[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); + src += half_width; + rf[0] += half_width; + rf[1] += half_width; + rf[2] += half_width; + rf[3] += half_width; + sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); + sum0 = _mm_add_epi32(sum0, sum1); + _mm_storeu_si128((__m128i *)res, sum0); +} + +void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + const uint8_t *rf[4]; + uint32_t sum0[4]; + uint32_t sum1[4]; + + rf[0] = ref[0]; + rf[1] = ref[1]; + rf[2] = ref[2]; + rf[3] = ref[3]; + aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0); + src += src_stride << 6; + rf[0] += ref_stride << 6; + rf[1] += ref_stride << 6; + rf[2] += ref_stride << 6; + rf[3] += ref_stride << 6; + aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1); + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + res[2] = sum0[2] + sum1[2]; + res[3] = sum0[3] + sum1[3]; +} + +static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int h, const uint8_t *second_pred, + const int second_pred_stride) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); + ref1_reg = _mm256_avg_epu8( + ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); + ref2_reg = _mm256_avg_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + second_pred += second_pred_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + + return res; +} + +unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 64 << 6; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 64); + return sum; +} + +unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + unsigned int half_width = 64; + uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + src_ptr += half_width; + ref_ptr += half_width; + second_pred += half_width; + sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, + second_pred, 128); + return sum; +} + +unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, + ref_stride, second_pred); + src_ptr += src_stride << 6; + ref_ptr += ref_stride << 6; + second_pred += 128 << 6; + sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, + second_pred); + return sum; +} diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm new file mode 100644 index 000000000..e45457a57 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -0,0 +1,345 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +%macro SAD_FN 4 +%if %4 == 0 +%if %3 == 5 +cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%else ; avg +%if %3 == 5 +cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ + second_pred, n_rows +%else ; %3 == 7 +cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ + ref, ref_stride, \ + second_pred, \ + src_stride3, ref_stride3 +%if ARCH_X86_64 +%define n_rowsd r7d +%else ; x86-32 +%define n_rowsd dword r0m +%endif ; x86-32/64 +%endif ; %3 == 5/7 +%endif ; avg/sad + movsxdifnidn src_strideq, src_strided + movsxdifnidn ref_strideq, ref_strided +%if %3 == 7 + lea src_stride3q, [src_strideq*3] + lea ref_stride3q, [ref_strideq*3] +%endif ; %3 == 7 +%endmacro + +%if CONFIG_EXT_PARTITION +; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD128XN 1-2 0 + SAD_FN 128, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [refq+64] + movu m2, [refq+80] + movu m3, [refq+96] + movu m4, [refq+112] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*4] + pavgb m2, [second_predq+mmsize*5] + pavgb m3, [second_predq+mmsize*6] + pavgb m4, [second_predq+mmsize*7] + lea second_predq, [second_predq+mmsize*8] +%endif + psadbw m1, [srcq+64] + psadbw m2, [srcq+80] + psadbw m3, [srcq+96] + psadbw m4, [srcq+112] + + add refq, ref_strideq + add srcq, src_strideq + + paddd m1, m2 + paddd m3, m4 + paddd m0, m1 + paddd m0, m3 + + sub n_rowsd, 1 + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD128XN 128 ; sad128x128_sse2 +SAD128XN 128, 1 ; sad128x128_avg_sse2 +SAD128XN 64 ; sad128x64_sse2 +SAD128XN 64, 1 ; sad128x64_avg_sse2 +%endif + + +; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD64XN 1-2 0 + SAD_FN 64, %1, 5, %2 + mov n_rowsd, %1 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+32] + movu m4, [refq+48] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+32] + psadbw m4, [srcq+48] + paddd m1, m2 + paddd m3, m4 + add refq, ref_strideq + paddd m0, m1 + add srcq, src_strideq + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +%if CONFIG_EXT_PARTITION +SAD64XN 128 ; sad64x128_sse2 +SAD64XN 128, 1 ; sad64x128_avg_sse2 +%endif +SAD64XN 64 ; sad64x64_sse2 +SAD64XN 32 ; sad64x32_sse2 +SAD64XN 64, 1 ; sad64x64_avg_sse2 +SAD64XN 32, 1 ; sad64x32_avg_sse2 + +; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD32XN 1-2 0 + SAD_FN 32, %1, 5, %2 + mov n_rowsd, %1/2 + pxor m0, m0 +.loop: + movu m1, [refq] + movu m2, [refq+16] + movu m3, [refq+ref_strideq] + movu m4, [refq+ref_strideq+16] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+16] + psadbw m3, [srcq+src_strideq] + psadbw m4, [srcq+src_strideq+16] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*2] + paddd m0, m1 + lea srcq, [srcq+src_strideq*2] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD32XN 64 ; sad32x64_sse2 +SAD32XN 32 ; sad32x32_sse2 +SAD32XN 16 ; sad32x16_sse2 +SAD32XN 64, 1 ; sad32x64_avg_sse2 +SAD32XN 32, 1 ; sad32x32_avg_sse2 +SAD32XN 16, 1 ; sad32x16_avg_sse2 + +; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD16XN 1-2 0 + SAD_FN 16, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movu m1, [refq] + movu m2, [refq+ref_strideq] + movu m3, [refq+ref_strideq*2] + movu m4, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + pavgb m3, [second_predq+mmsize*2] + pavgb m4, [second_predq+mmsize*3] + lea second_predq, [second_predq+mmsize*4] +%endif + psadbw m1, [srcq] + psadbw m2, [srcq+src_strideq] + psadbw m3, [srcq+src_strideq*2] + psadbw m4, [srcq+src_stride3q] + paddd m1, m2 + paddd m3, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m3 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD16XN 32 ; sad16x32_sse2 +SAD16XN 16 ; sad16x16_sse2 +SAD16XN 8 ; sad16x8_sse2 +SAD16XN 32, 1 ; sad16x32_avg_sse2 +SAD16XN 16, 1 ; sad16x16_avg_sse2 +SAD16XN 8, 1 ; sad16x8_avg_sse2 + +; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD8XN 1-2 0 + SAD_FN 8, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movh m1, [refq] + movhps m1, [refq+ref_strideq] + movh m2, [refq+ref_strideq*2] + movhps m2, [refq+ref_stride3q] +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + pavgb m2, [second_predq+mmsize*1] + lea second_predq, [second_predq+mmsize*2] +%endif + movh m3, [srcq] + movhps m3, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movhps m4, [srcq+src_stride3q] + psadbw m1, m3 + psadbw m2, m4 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD8XN 16 ; sad8x16_sse2 +SAD8XN 8 ; sad8x8_sse2 +SAD8XN 4 ; sad8x4_sse2 +SAD8XN 16, 1 ; sad8x16_avg_sse2 +SAD8XN 8, 1 ; sad8x8_avg_sse2 +SAD8XN 4, 1 ; sad8x4_avg_sse2 + +; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro SAD4XN 1-2 0 + SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + +.loop: + movd m1, [refq] + movd m2, [refq+ref_strideq] + movd m3, [refq+ref_strideq*2] + movd m4, [refq+ref_stride3q] + punpckldq m1, m2 + punpckldq m3, m4 + movlhps m1, m3 +%if %2 == 1 + pavgb m1, [second_predq+mmsize*0] + lea second_predq, [second_predq+mmsize*1] +%endif + movd m2, [srcq] + movd m5, [srcq+src_strideq] + movd m4, [srcq+src_strideq*2] + movd m3, [srcq+src_stride3q] + punpckldq m2, m5 + punpckldq m4, m3 + movlhps m2, m4 + psadbw m1, m2 + lea refq, [refq+ref_strideq*4] + paddd m0, m1 + lea srcq, [srcq+src_strideq*4] + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +SAD4XN 8 ; sad4x8_sse +SAD4XN 4 ; sad4x4_sse +SAD4XN 8, 1 ; sad4x8_avg_sse +SAD4XN 4, 1 ; sad4x4_avg_sse diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm new file mode 100644 index 000000000..f6c27c855 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_sse3.asm @@ -0,0 +1,377 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +%macro STACK_FRAME_CREATE_X3 0 +%if ABI_IS_32BIT + %define src_ptr rsi + %define src_stride rax + %define ref_ptr rdi + %define ref_stride rdx + %define end_ptr rcx + %define ret_var rbx + %define result_ptr arg(4) + %define height dword ptr arg(4) + push rbp + mov rbp, rsp + push rsi + push rdi + push rbx + + mov rsi, arg(0) ; src_ptr + mov rdi, arg(2) ; ref_ptr + + movsxd rax, dword ptr arg(1) ; src_stride + movsxd rdx, dword ptr arg(3) ; ref_stride +%else + %if LIBAOM_YASM_WIN64 + SAVE_XMM 7, u + %define src_ptr rcx + %define src_stride rdx + %define ref_ptr r8 + %define ref_stride r9 + %define end_ptr r10 + %define ret_var r11 + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define height dword ptr [rsp+xmm_stack_space+8+4*8] + %else + %define src_ptr rdi + %define src_stride rsi + %define ref_ptr rdx + %define ref_stride rcx + %define end_ptr r9 + %define ret_var r10 + %define result_ptr r8 + %define height r8 + %endif +%endif + +%endmacro + +%macro STACK_FRAME_DESTROY_X3 0 + %define src_ptr + %define src_stride + %define ref_ptr + %define ref_stride + %define end_ptr + %define ret_var + %define result_ptr + %define height + +%if ABI_IS_32BIT + pop rbx + pop rdi + pop rsi + pop rbp +%else + %if LIBAOM_YASM_WIN64 + RESTORE_XMM + %endif +%endif + ret +%endmacro + +%macro PROCESS_16X2X3 5 +%if %1==0 + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm5, XMMWORD PTR [%3] + lddqu xmm6, XMMWORD PTR [%3+1] + lddqu xmm7, XMMWORD PTR [%3+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [%2] + lddqu xmm1, XMMWORD PTR [%3] + lddqu xmm2, XMMWORD PTR [%3+1] + lddqu xmm3, XMMWORD PTR [%3+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [%2+%4] + lddqu xmm1, XMMWORD PTR [%3+%5] + lddqu xmm2, XMMWORD PTR [%3+%5+1] + lddqu xmm3, XMMWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_8X2X3 5 +%if %1==0 + movq mm0, QWORD PTR [%2] + movq mm5, QWORD PTR [%3] + movq mm6, QWORD PTR [%3+1] + movq mm7, QWORD PTR [%3+2] + + psadbw mm5, mm0 + psadbw mm6, mm0 + psadbw mm7, mm0 +%else + movq mm0, QWORD PTR [%2] + movq mm1, QWORD PTR [%3] + movq mm2, QWORD PTR [%3+1] + movq mm3, QWORD PTR [%3+2] + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endif + movq mm0, QWORD PTR [%2+%4] + movq mm1, QWORD PTR [%3+%5] + movq mm2, QWORD PTR [%3+%5+1] + movq mm3, QWORD PTR [%3+%5+2] + +%if %1==0 || %1==1 + lea %2, [%2+%4*2] + lea %3, [%3+%5*2] +%endif + + psadbw mm1, mm0 + psadbw mm2, mm0 + psadbw mm3, mm0 + + paddw mm5, mm1 + paddw mm6, mm2 + paddw mm7, mm3 +%endmacro + +;void int aom_sad16x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x16x3_sse3) PRIVATE +sym(aom_sad16x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad16x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x8x3_sse3) PRIVATE +sym(aom_sad16x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rcx], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rcx+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rcx+8], xmm0 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad8x16x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad8x16x3_sse3) PRIVATE +sym(aom_sad8x16x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad8x8x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad8x8x3_sse3) PRIVATE +sym(aom_sad8x8x3_sse3): + + STACK_FRAME_CREATE_X3 + + PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride + PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride + + mov rcx, result_ptr + + punpckldq mm5, mm6 + + movq [rcx], mm5 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 + +;void int aom_sad4x4x3_sse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad4x4x3_sse3) PRIVATE +sym(aom_sad4x4x3_sse3): + + STACK_FRAME_CREATE_X3 + + movd mm0, DWORD PTR [src_ptr] + movd mm1, DWORD PTR [ref_ptr] + + movd mm2, DWORD PTR [src_ptr+src_stride] + movd mm3, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm2 + punpcklbw mm1, mm3 + + movd mm4, DWORD PTR [ref_ptr+1] + movd mm5, DWORD PTR [ref_ptr+2] + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm3, DWORD PTR [ref_ptr+ref_stride+2] + + psadbw mm1, mm0 + + punpcklbw mm4, mm2 + punpcklbw mm5, mm3 + + psadbw mm4, mm0 + psadbw mm5, mm0 + + lea src_ptr, [src_ptr+src_stride*2] + lea ref_ptr, [ref_ptr+ref_stride*2] + + movd mm0, DWORD PTR [src_ptr] + movd mm2, DWORD PTR [ref_ptr] + + movd mm3, DWORD PTR [src_ptr+src_stride] + movd mm6, DWORD PTR [ref_ptr+ref_stride] + + punpcklbw mm0, mm3 + punpcklbw mm2, mm6 + + movd mm3, DWORD PTR [ref_ptr+1] + movd mm7, DWORD PTR [ref_ptr+2] + + psadbw mm2, mm0 + + paddw mm1, mm2 + + movd mm2, DWORD PTR [ref_ptr+ref_stride+1] + movd mm6, DWORD PTR [ref_ptr+ref_stride+2] + + punpcklbw mm3, mm2 + punpcklbw mm7, mm6 + + psadbw mm3, mm0 + psadbw mm7, mm0 + + paddw mm3, mm4 + paddw mm7, mm5 + + mov rcx, result_ptr + + punpckldq mm1, mm3 + + movq [rcx], mm1 + movd [rcx+8], mm7 + + STACK_FRAME_DESTROY_X3 diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm new file mode 100644 index 000000000..5e9c75845 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_sse4.asm @@ -0,0 +1,362 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X8 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm1, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm1, xmm2 + paddw xmm1, xmm3 + paddw xmm1, xmm4 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endif + movdqa xmm0, XMMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + movq xmm2, MMWORD PTR [rdi+ rdx+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_8X2X8 1 +%if %1 + movq xmm0, MMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm1, xmm2 +%else + movq xmm0, MMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endif + movq xmm0, MMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_4X2X8 1 +%if %1 + movd xmm0, [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + mpsadbw xmm1, xmm0, 0x0 +%else + movd xmm0, [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endif + movd xmm0, [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endmacro + +%macro WRITE_AS_INTS 0 + mov rdi, arg(4) ;Results + pxor xmm0, xmm0 + movdqa xmm2, xmm1 + punpcklwd xmm1, xmm0 + punpckhwd xmm2, xmm0 + + movdqa [rdi], xmm1 + movdqa [rdi + 16], xmm2 +%endmacro + +;void aom_sad16x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array); +global sym(aom_sad16x16x8_sse4_1) PRIVATE +sym(aom_sad16x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad16x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(aom_sad16x8x8_sse4_1) PRIVATE +sym(aom_sad16x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad8x8x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(aom_sad8x8x8_sse4_1) PRIVATE +sym(aom_sad8x8x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad8x16x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(aom_sad8x16x8_sse4_1) PRIVATE +sym(aom_sad8x16x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void aom_sad4x4x8_sse4_1( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(aom_sad4x4x8_sse4_1) PRIVATE +sym(aom_sad4x4x8_sse4_1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 + + WRITE_AS_INTS + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + + diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm new file mode 100644 index 000000000..96b64b040 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sad_ssse3.asm @@ -0,0 +1,373 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + +%include "aom_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X3 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm5, XMMWORD PTR [rdi] + lddqu xmm6, XMMWORD PTR [rdi+1] + lddqu xmm7, XMMWORD PTR [rdi+2] + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + lddqu xmm1, XMMWORD PTR [rdi] + lddqu xmm2, XMMWORD PTR [rdi+1] + lddqu xmm3, XMMWORD PTR [rdi+2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + lddqu xmm1, XMMWORD PTR [rdi+rdx] + lddqu xmm2, XMMWORD PTR [rdi+rdx+1] + lddqu xmm3, XMMWORD PTR [rdi+rdx+2] + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X2X3_OFFSET 2 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm7, XMMWORD PTR [rdi+16] + + movdqa xmm5, xmm7 + palignr xmm5, xmm4, %2 + + movdqa xmm6, xmm7 + palignr xmm6, xmm4, (%2+1) + + palignr xmm7, xmm4, (%2+2) + + psadbw xmm5, xmm0 + psadbw xmm6, xmm0 + psadbw xmm7, xmm0 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movdqa xmm4, XMMWORD PTR [rdi] + movdqa xmm3, XMMWORD PTR [rdi+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endif + movdqa xmm0, XMMWORD PTR [rsi+rax] + movdqa xmm4, XMMWORD PTR [rdi+rdx] + movdqa xmm3, XMMWORD PTR [rdi+rdx+16] + + movdqa xmm1, xmm3 + palignr xmm1, xmm4, %2 + + movdqa xmm2, xmm3 + palignr xmm2, xmm4, (%2+1) + + palignr xmm3, xmm4, (%2+2) + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + psadbw xmm1, xmm0 + psadbw xmm2, xmm0 + psadbw xmm3, xmm0 + + paddw xmm5, xmm1 + paddw xmm6, xmm2 + paddw xmm7, xmm3 +%endmacro + +%macro PROCESS_16X16X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +%macro PROCESS_16X8X3_OFFSET 2 +%2_aligned_by_%1: + + sub rdi, %1 + + PROCESS_16X2X3_OFFSET 1, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + PROCESS_16X2X3_OFFSET 0, %1 + + jmp %2_store_off + +%endmacro + +;void int aom_sad16x16x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x16x3_ssse3) PRIVATE +sym(aom_sad16x16x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp .aom_sad16x16x3_ssse3_skiptable +.aom_sad16x16x3_ssse3_jumptable: + dd .aom_sad16x16x3_ssse3_aligned_by_0 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_1 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_2 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_3 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_4 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_5 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_6 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_7 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_8 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_9 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump + dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump +.aom_sad16x16x3_ssse3_skiptable: + + call .aom_sad16x16x3_ssse3_do_jump +.aom_sad16x16x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump + add rax, rcx ; get the absolute address of aom_sad16x16x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X16X3_OFFSET 0, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 1, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 2, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 3, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 4, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 5, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 6, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 7, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 8, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 9, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3 + PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3 + +.aom_sad16x16x3_ssse3_aligned_by_15: + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.aom_sad16x16x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void int aom_sad16x8x3_ssse3( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; int *results) +global sym(aom_sad16x8x3_ssse3) PRIVATE +sym(aom_sad16x8x3_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + push rsi + push rdi + push rcx + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rdx, 0xf + and rdx, rdi + + jmp .aom_sad16x8x3_ssse3_skiptable +.aom_sad16x8x3_ssse3_jumptable: + dd .aom_sad16x8x3_ssse3_aligned_by_0 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_1 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_2 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_3 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_4 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_5 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_6 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_7 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_8 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_9 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump + dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump +.aom_sad16x8x3_ssse3_skiptable: + + call .aom_sad16x8x3_ssse3_do_jump +.aom_sad16x8x3_ssse3_do_jump: + pop rcx ; get the address of do_jump + mov rax, .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump + add rax, rcx ; get the absolute address of aom_sad16x8x3_ssse3_jumptable + + movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable + add rcx, rax + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + jmp rcx + + PROCESS_16X8X3_OFFSET 0, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 1, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 2, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 3, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 4, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 5, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 6, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 7, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 8, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 9, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3 + PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3 + +.aom_sad16x8x3_ssse3_aligned_by_15: + + PROCESS_16X2X3 1 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + PROCESS_16X2X3 0 + +.aom_sad16x8x3_ssse3_store_off: + mov rdi, arg(4) ;Results + + movq xmm0, xmm5 + psrldq xmm5, 8 + + paddw xmm0, xmm5 + movd [rdi], xmm0 +;- + movq xmm0, xmm6 + psrldq xmm6, 8 + + paddw xmm0, xmm6 + movd [rdi+4], xmm0 +;- + movq xmm0, xmm7 + psrldq xmm7, 8 + + paddw xmm0, xmm7 + movd [rdi+8], xmm0 + + ; begin epilog + pop rcx + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm new file mode 100644 index 000000000..aa70106c8 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm @@ -0,0 +1,219 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "aom_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddd xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddd xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddd xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(aom_ssim_parms_16x16_sse2) PRIVATE +sym(aom_ssim_parms_16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +.NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse2( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; uint32_t *sum_s, +; uint32_t *sum_r, +; uint32_t *sum_sq_s, +; uint32_t *sum_sq_r, +; uint32_t *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. +global sym(aom_ssim_parms_8x8_sse2) PRIVATE +sym(aom_ssim_parms_8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +.NextRow: + + ;grab source and reference pixels + movq xmm3, [rsi] + movq xmm4, [rdi] + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz .NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movd [rdi], xmm15; + mov rdi,arg(5) + movd [rdi], xmm14; + mov rdi,arg(6) + movd [rdi], xmm13; + mov rdi,arg(7) + movd [rdi], xmm12; + mov rdi,arg(8) + movd [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm new file mode 100644 index 000000000..d3feb7ec0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm @@ -0,0 +1,1489 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 14 + times 8 dw 2 + times 8 dw 12 + times 8 dw 4 + times 8 dw 10 + times 8 dw 6 + times 16 dw 8 + times 8 dw 6 + times 8 dw 10 + times 8 dw 4 + times 8 dw 12 + times 8 dw 2 + times 8 dw 14 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 14, 2 + times 8 db 12, 4 + times 8 db 10, 6 + times 16 db 8 + times 8 db 6, 10 + times 8 db 4, 12 + times 8 db 2, 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 1 +%if %1 > 4 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 + +%ifdef PIC ; 64bit PIC + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define block_height dword heightm + %define sec_str sec_stridemp + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 + %define block_height heightd + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define block_height heightd + %define sec_str sec_strideq + %else + %define block_height dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %define block_height heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [dstq] + paddw m4, m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, [dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM sse2 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm new file mode 100644 index 000000000..7bd5b23ad --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm @@ -0,0 +1,150 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void aom_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 +%if CONFIG_EXT_PARTITION + cmp colsd, 64 + je .case_64 +%endif + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + +%if CONFIG_EXT_PARTITION + mov pred_str, pred_stridemp +.loop_128: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize + loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + sub rowsd, 1 + jnz .loop_128 + RET + +.case_64: +%endif + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, pred_stridemp +.loop_4: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_4 + RET diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c new file mode 100644 index 000000000..6be99fbca --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_dsp/x86/synonyms.h" + +#include "./aom_dsp_rtcd.h" + +static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, + int stride) { + const __m128i v_val_0_w = + _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + const __m128i v_val_1_w = + _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); + const __m128i v_val_2_w = + _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + const __m128i v_val_3_w = + _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + + const __m128i v_sum_d = + _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); + + return (uint64_t)_mm_cvtsi128_si32(v_sum_d); +} + +#ifdef __GNUC__ +// This prevents GCC/Clang from inlining this function into +// aom_sum_squares_2d_i16_sse2, which in turn saves some stack +// maintenance instructions in the common case of 4x4. +__attribute__((noinline)) +#endif +static uint64_t +aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, + int height) { + int r, c; + + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); + + for (r = 0; r < height; r += 8) { + __m128i v_acc_d = _mm_setzero_si128(); + + for (c = 0; c < width; c += 8) { + const int16_t *b = src + c; + + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + } + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + } + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } +#endif +} + +uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, + int height) { + // 4 elements per row only requires half an XMM register, so this + // must be a special case, but also note that over 75% of all calls + // are with size == 4, so it is also the common case. + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) { + // Generic case + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} + +////////////////////////////////////////////////////////////////////////////// +// 1D version +////////////////////////////////////////////////////////////////////////////// + +static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc0_q = _mm_setzero_si128(); + __m128i v_acc1_q = _mm_setzero_si128(); + + const int16_t *const end = src + n; + + assert(n % 64 == 0); + + while (src < end) { + const __m128i v_val_0_w = xx_load_128(src); + const __m128i v_val_1_w = xx_load_128(src + 8); + const __m128i v_val_2_w = xx_load_128(src + 16); + const __m128i v_val_3_w = xx_load_128(src + 24); + const __m128i v_val_4_w = xx_load_128(src + 32); + const __m128i v_val_5_w = xx_load_128(src + 40); + const __m128i v_val_6_w = xx_load_128(src + 48); + const __m128i v_val_7_w = xx_load_128(src + 56); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d); + + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q)); + v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32)); + + src += 64; + } + + v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); + v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); + +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc0_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc0_q); + return tmp; + } +#endif +} + +uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { + if (n % 64 == 0) { + return aom_sum_squares_i16_64n_sse2(src, n); + } else if (n > 64) { + int k = n & ~(64 - 1); + return aom_sum_squares_i16_64n_sse2(src, k) + + aom_sum_squares_i16_c(src + k, n - k); + } else { + return aom_sum_squares_i16_c(src, n); + } +} diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h new file mode 100644 index 000000000..bef606dae --- /dev/null +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_SYNONYMS_H_ +#define AOM_DSP_X86_SYNONYMS_H_ + +#include + +#include "./aom_config.h" +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. +static INLINE __m128i xx_loadl_32(const void *a) { + return _mm_cvtsi32_si128(*(const uint32_t *)a); +} + +static INLINE __m128i xx_loadl_64(const void *a) { + return _mm_loadl_epi64((const __m128i *)a); +} + +static INLINE __m128i xx_load_128(const void *a) { + return _mm_load_si128((const __m128i *)a); +} + +static INLINE __m128i xx_loadu_128(const void *a) { + return _mm_loadu_si128((const __m128i *)a); +} + +static INLINE void xx_storel_32(void *const a, const __m128i v) { + *(uint32_t *)a = _mm_cvtsi128_si32(v); +} + +static INLINE void xx_storel_64(void *const a, const __m128i v) { + _mm_storel_epi64((__m128i *)a, v); +} + +static INLINE void xx_store_128(void *const a, const __m128i v) { + _mm_store_si128((__m128i *)a, v); +} + +static INLINE void xx_storeu_128(void *const a, const __m128i v) { + _mm_storeu_si128((__m128i *)a, v); +} + +static INLINE __m128i xx_round_epu16(__m128i v_val_w) { + return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) { + const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1); + return _mm_avg_epu16(v_s_w, _mm_setzero_si128()); +} + +static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srli_epi32(v_tmp_d, bits); +} + +// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +#ifdef __SSSE3__ +static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { + v_d = _mm_hadd_epi32(v_d, v_d); + v_d = _mm_hadd_epi32(v_d, v_d); + return _mm_cvtsi128_si32(v_d); +} + +static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { + v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); +#if ARCH_X86_64 + return _mm_cvtsi128_si64(v_q); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_q); + return tmp; + } +#endif +} + +static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); + const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); + return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); +} +#endif // __SSSE3__ + +#endif // AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..39e9b8e2a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H +#define AOM_DSP_X86_TXFM_COMMON_AVX2_H + +#include + +#include "aom_dsp/txfm_common.h" + +#define pair256_set_epi16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +#define pair256_set_epi32(a, b) \ + _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \ + (int)(b), (int)(a)) + +static INLINE void mm256_reverse_epi16(__m256i *u) { + const __m256i control = _mm256_set_epi16( + 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100, + 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E); + __m256i v = _mm256_shuffle_epi8(*u, control); + *u = _mm256_permute2x128_si256(v, v, 1); +} + +static INLINE void mm256_transpose_16x16(__m256i *in) { + __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); + __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); + __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); + __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); + __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); + __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); + __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); + __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); + + __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); + __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); + __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); + __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); + __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); + __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); + __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); + __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); + + // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b + // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f + // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b + // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f + // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b + // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f + // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b + // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f + + // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b + // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f + // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb + // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf + // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db + // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df + // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb + // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff + + __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); + __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); + __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); + __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); + __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); + __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); + __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); + __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); + + __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); + __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); + __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); + __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); + __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); + __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); + __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); + __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); + + // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 + // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b + // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d + // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f + // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 + // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b + // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d + // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f + + // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 + // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb + // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd + // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf + // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 + // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb + // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd + // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff + + tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + + tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); + tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); + tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); + tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); + tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); + tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); + tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); + tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); + + // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a + // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b + // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c + // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d + // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e + // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f + + // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 + // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 + // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa + // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb + // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc + // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd + // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe + // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff + + in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); +} + +static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) { + const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + __m256i y0 = _mm256_madd_epi16(a0, cospi); + __m256i y1 = _mm256_madd_epi16(a1, cospi); + + y0 = _mm256_add_epi32(y0, dct_rounding); + y1 = _mm256_add_epi32(y1, dct_rounding); + y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS); + y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS); + + return _mm256_packs_epi32(y0, y1); +} + +static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i sqrt2_epi16 = _mm256_set1_epi16(c); + const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + __m256i u0, u1; + int i = 0; + + while (i < 16) { + in[i] = _mm256_slli_epi16(in[i], 1); + + u0 = _mm256_unpacklo_epi16(zero, in[i]); + u1 = _mm256_unpackhi_epi16(zero, in[i]); + + u0 = _mm256_madd_epi16(u0, sqrt2_epi16); + u1 = _mm256_madd_epi16(u1, sqrt2_epi16); + + u0 = _mm256_add_epi32(u0, dct_const_rounding); + u1 = _mm256_add_epi32(u1, dct_const_rounding); + + u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); + u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); + in[i] = _mm256_packs_epi32(u0, u1); + i++; + } +} + +#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h new file mode 100644 index 000000000..e4ac56339 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { +#if CONFIG_HIGHBITDEPTH + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); +#else + _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); +#endif // CONFIG_HIGHBITDEPTH +} + +#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h new file mode 100644 index 000000000..4257d8b9c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_ + +#include +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#define pair_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +#define dual_set_epi16(a, b) \ + _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ + (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) + +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + +// Reverse the 8 16 bit words in __m128i +static INLINE __m128i mm_reverse_epi16(const __m128i x) { + const __m128i a = _mm_shufflelo_epi16(x, 0x1b); + const __m128i b = _mm_shufflehi_epi16(a, 0x1b); + return _mm_shuffle_epi32(b, 0x4e); +} + +#if CONFIG_EXT_TX +// Identity transform (both forward and inverse). +static INLINE void idtx16_8col(__m128i *in) { + const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); + const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + + __m128i v0, v1, v2, v3, v4, v5, v6, v7; + __m128i u0, u1, u2, u3, u4, u5, u6, u7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i y0, y1, y2, y3, y4, y5, y6, y7; + + in[0] = _mm_slli_epi16(in[0], 1); + in[1] = _mm_slli_epi16(in[1], 1); + in[2] = _mm_slli_epi16(in[2], 1); + in[3] = _mm_slli_epi16(in[3], 1); + in[4] = _mm_slli_epi16(in[4], 1); + in[5] = _mm_slli_epi16(in[5], 1); + in[6] = _mm_slli_epi16(in[6], 1); + in[7] = _mm_slli_epi16(in[7], 1); + in[8] = _mm_slli_epi16(in[8], 1); + in[9] = _mm_slli_epi16(in[9], 1); + in[10] = _mm_slli_epi16(in[10], 1); + in[11] = _mm_slli_epi16(in[11], 1); + in[12] = _mm_slli_epi16(in[12], 1); + in[13] = _mm_slli_epi16(in[13], 1); + in[14] = _mm_slli_epi16(in[14], 1); + in[15] = _mm_slli_epi16(in[15], 1); + + v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); + v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); + v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); + v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); + v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16); + v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16); + v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16); + v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16); + + u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16); + u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16); + u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16); + u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16); + u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16); + u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16); + u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16); + u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16); + + x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16); + x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16); + x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16); + x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16); + x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16); + x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16); + x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16); + x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16); + + y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16); + y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16); + y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16); + y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16); + y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16); + y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16); + y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16); + y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16); + + v0 = _mm_madd_epi16(v0, k__sqrt2_epi16); + v1 = _mm_madd_epi16(v1, k__sqrt2_epi16); + v2 = _mm_madd_epi16(v2, k__sqrt2_epi16); + v3 = _mm_madd_epi16(v3, k__sqrt2_epi16); + v4 = _mm_madd_epi16(v4, k__sqrt2_epi16); + v5 = _mm_madd_epi16(v5, k__sqrt2_epi16); + v6 = _mm_madd_epi16(v6, k__sqrt2_epi16); + v7 = _mm_madd_epi16(v7, k__sqrt2_epi16); + + x0 = _mm_madd_epi16(x0, k__sqrt2_epi16); + x1 = _mm_madd_epi16(x1, k__sqrt2_epi16); + x2 = _mm_madd_epi16(x2, k__sqrt2_epi16); + x3 = _mm_madd_epi16(x3, k__sqrt2_epi16); + x4 = _mm_madd_epi16(x4, k__sqrt2_epi16); + x5 = _mm_madd_epi16(x5, k__sqrt2_epi16); + x6 = _mm_madd_epi16(x6, k__sqrt2_epi16); + x7 = _mm_madd_epi16(x7, k__sqrt2_epi16); + + u0 = _mm_madd_epi16(u0, k__sqrt2_epi16); + u1 = _mm_madd_epi16(u1, k__sqrt2_epi16); + u2 = _mm_madd_epi16(u2, k__sqrt2_epi16); + u3 = _mm_madd_epi16(u3, k__sqrt2_epi16); + u4 = _mm_madd_epi16(u4, k__sqrt2_epi16); + u5 = _mm_madd_epi16(u5, k__sqrt2_epi16); + u6 = _mm_madd_epi16(u6, k__sqrt2_epi16); + u7 = _mm_madd_epi16(u7, k__sqrt2_epi16); + + y0 = _mm_madd_epi16(y0, k__sqrt2_epi16); + y1 = _mm_madd_epi16(y1, k__sqrt2_epi16); + y2 = _mm_madd_epi16(y2, k__sqrt2_epi16); + y3 = _mm_madd_epi16(y3, k__sqrt2_epi16); + y4 = _mm_madd_epi16(y4, k__sqrt2_epi16); + y5 = _mm_madd_epi16(y5, k__sqrt2_epi16); + y6 = _mm_madd_epi16(y6, k__sqrt2_epi16); + y7 = _mm_madd_epi16(y7, k__sqrt2_epi16); + + v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); + v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); + v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); + v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); + v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); + v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); + v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); + v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); + + x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING); + x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING); + x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING); + x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING); + x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING); + x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING); + x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING); + x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING); + + u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + + y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING); + y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING); + y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING); + y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING); + y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING); + y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING); + y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING); + y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING); + + v0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + v1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + v2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + v3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + v4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + v5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + v6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + v7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + + x0 = _mm_srai_epi32(x0, DCT_CONST_BITS); + x1 = _mm_srai_epi32(x1, DCT_CONST_BITS); + x2 = _mm_srai_epi32(x2, DCT_CONST_BITS); + x3 = _mm_srai_epi32(x3, DCT_CONST_BITS); + x4 = _mm_srai_epi32(x4, DCT_CONST_BITS); + x5 = _mm_srai_epi32(x5, DCT_CONST_BITS); + x6 = _mm_srai_epi32(x6, DCT_CONST_BITS); + x7 = _mm_srai_epi32(x7, DCT_CONST_BITS); + + u0 = _mm_srai_epi32(u0, DCT_CONST_BITS); + u1 = _mm_srai_epi32(u1, DCT_CONST_BITS); + u2 = _mm_srai_epi32(u2, DCT_CONST_BITS); + u3 = _mm_srai_epi32(u3, DCT_CONST_BITS); + u4 = _mm_srai_epi32(u4, DCT_CONST_BITS); + u5 = _mm_srai_epi32(u5, DCT_CONST_BITS); + u6 = _mm_srai_epi32(u6, DCT_CONST_BITS); + u7 = _mm_srai_epi32(u7, DCT_CONST_BITS); + + y0 = _mm_srai_epi32(y0, DCT_CONST_BITS); + y1 = _mm_srai_epi32(y1, DCT_CONST_BITS); + y2 = _mm_srai_epi32(y2, DCT_CONST_BITS); + y3 = _mm_srai_epi32(y3, DCT_CONST_BITS); + y4 = _mm_srai_epi32(y4, DCT_CONST_BITS); + y5 = _mm_srai_epi32(y5, DCT_CONST_BITS); + y6 = _mm_srai_epi32(y6, DCT_CONST_BITS); + y7 = _mm_srai_epi32(y7, DCT_CONST_BITS); + + in[0] = _mm_packs_epi32(v0, x0); + in[1] = _mm_packs_epi32(v1, x1); + in[2] = _mm_packs_epi32(v2, x2); + in[3] = _mm_packs_epi32(v3, x3); + in[4] = _mm_packs_epi32(v4, x4); + in[5] = _mm_packs_epi32(v5, x5); + in[6] = _mm_packs_epi32(v6, x6); + in[7] = _mm_packs_epi32(v7, x7); + + in[8] = _mm_packs_epi32(u0, y0); + in[9] = _mm_packs_epi32(u1, y1); + in[10] = _mm_packs_epi32(u2, y2); + in[11] = _mm_packs_epi32(u3, y3); + in[12] = _mm_packs_epi32(u4, y4); + in[13] = _mm_packs_epi32(u5, y5); + in[14] = _mm_packs_epi32(u6, y6); + in[15] = _mm_packs_epi32(u7, y7); +} +#endif // CONFIG_EXT_TX + +static INLINE void scale_sqrt2_8x4(__m128i *in) { + // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32 + // consecutive elements. + const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); + + const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); + const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); + const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); + const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); + const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); + const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); + const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); + const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); + + const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); + + in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); + in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); + in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); + in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); +} + +static INLINE void scale_sqrt2_8x8(__m128i *in) { + // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)' + // for each element. + const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); + + const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); + const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); + const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); + const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); + const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); + const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); + const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); + const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); + const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w); + const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w); + const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w); + const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w); + const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w); + const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w); + const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w); + const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w); + + const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); + const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); + const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); + const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); + const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w); + const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w); + const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w); + const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w); + const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w); + const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w); + const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w); + const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w); + + in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); + in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); + in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); + in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); + in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS)); + in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS)); + in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS)); + in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS), + xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS)); +} + +static INLINE void scale_sqrt2_8x16(__m128i *in) { + scale_sqrt2_8x8(in); + scale_sqrt2_8x8(in + 8); +} + +#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c new file mode 100644 index 000000000..18a70dffe --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include "./aom_dsp_rtcd.h" + +typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +void aom_get32x32var_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum); + +static void variance_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int w, int h, + unsigned int *sse, int *sum, get_var_avx2 var_fn, + int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += 16) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + aom_get16x16var_avx2, 16); + + variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); + _mm256_zeroupper(); + return *sse; +} + +unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + unsigned int variance; + variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + aom_get32x32var_avx2, 32); + + variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sseptr); + +unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src, + int src_stride, int x_offset, + int y_offset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + unsigned int sse1; + const int se1 = aom_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); + unsigned int sse2; + const int se2 = + aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); + const int se = se1 + se2; + unsigned int variance; + *sse = sse1 + sse2; + + variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src, + int src_stride, int x_offset, + int y_offset, const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + const int se = aom_sub_pixel_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); + + const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_avg_variance64x64_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + unsigned int sse1; + const int se1 = aom_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); + unsigned int sse2; + const int se2 = aom_sub_pixel_avg_variance32xh_avx2( + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, + 64, 64, &sse2); + const int se = se1 + se2; + unsigned int variance; + + *sse = sse1 + sse2; + + variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); + _mm256_zeroupper(); + return variance; +} + +unsigned int aom_sub_pixel_avg_variance32x32_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + // Process 32 elements in parallel. + const int se = aom_sub_pixel_avg_variance32xh_avx2( + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); + + const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); + _mm256_zeroupper(); + return variance; +} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c new file mode 100644 index 000000000..999b541e3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c @@ -0,0 +1,713 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // AVX2 + +#include "./aom_dsp_rtcd.h" +#include "aom_ports/mem.h" + +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; +/* clang-format on */ + +void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i, src_2strides, ref_2strides; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing two strides in a 256 bit register reducing the number + // of loop stride by half (comparing to the sse2 code) + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); + src = _mm256_inserti128_si256( + src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); + + ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); + ref = _mm256_inserti128_si256( + ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + + { + __m128i sum_res, madd_res; + __m128i expand_sum_low, expand_sum_high, expand_sum; + __m128i expand_madd_low, expand_madd_high, expand_madd; + __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // extract the low lane and add it to the high lane + sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), + _mm256_extractf128_si256(sum_ref_src, 1)); + + madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), + _mm256_extractf128_si256(madd_ref_src, 1)); + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = + _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); + expand_sum_high = + _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); + + // shifting the sign 16 bits right + expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); + + expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = + _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); + expand_madd_high = + _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); + + expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = + _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); + ex_expand_sum_high = + _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); + + ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + + // shift 8 bytes eight + madd_res = _mm_srli_si128(expand_madd, 8); + sum_res = _mm_srli_si128(ex_expand_sum, 8); + + madd_res = _mm_add_epi32(madd_res, expand_madd); + sum_res = _mm_add_epi32(sum_res, ex_expand_sum); + + *((int *)SSE) = _mm_cvtsi128_si32(madd_res); + + *((int *)Sum) = _mm_cvtsi128_si32(sum_res); + } + _mm256_zeroupper(); +} + +void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *SSE, int *Sum) { + __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; + __m256i ref_expand_high, madd_low, madd_high; + unsigned int i; + __m256i zero_reg = _mm256_set1_epi16(0); + __m256i sum_ref_src = _mm256_set1_epi16(0); + __m256i madd_ref_src = _mm256_set1_epi16(0); + + // processing 32 elements in parallel + for (i = 0; i < 16; i++) { + src = _mm256_loadu_si256((__m256i const *)(src_ptr)); + + ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); + + // expanding to 16 bit each lane + src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); + src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); + + ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); + ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); + + // src-ref + src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); + src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); + + // madd low (src - ref) + madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); + + // add high to low + src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); + + // madd high (src - ref) + madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); + + sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); + + // add high to low + madd_ref_src = + _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); + + src_ptr += source_stride; + ref_ptr += recon_stride; + } + + { + __m256i expand_sum_low, expand_sum_high, expand_sum; + __m256i expand_madd_low, expand_madd_high, expand_madd; + __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; + + // padding each 2 bytes with another 2 zeroed bytes + expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); + expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); + + // shifting the sign 16 bits right + expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); + expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); + + expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); + + // expand each 32 bits of the madd result to 64 bits + expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); + expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); + + expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); + + ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); + ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); + + ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); + + // shift 8 bytes eight + madd_ref_src = _mm256_srli_si256(expand_madd, 8); + sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); + + madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); + sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); + + // extract the low lane and the high lane and add the results + *((int *)SSE) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); + + *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); + } + _mm256_zeroupper(); +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ + exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); + +#define MERGE_WITH_SRC(src_reg, reg) \ + exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ + exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); + +#define LOAD_SRC_DST \ + /* load source and destination */ \ + src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ + dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); + +#define AVG_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC(src_reg, size_stride) \ + src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define CALC_SUM_SSE_INSIDE_LOOP \ + /* expand each byte to 2 bytes */ \ + exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ + exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ + /* source - dest */ \ + exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ + exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ + /* caculate sum */ \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ + exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ + sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ + exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ + /* calculate sse */ \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ + sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); + +// final calculation to sum and sse +#define CALC_SUM_AND_SSE \ + res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ + sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ + sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ + \ + sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ + \ + sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ + sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ + sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ + sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ + _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); + +unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + +unsigned int aom_sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + __m256i sec_reg; + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i++) { + LOAD_SRC_DST + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } else if (y_offset == 8) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, src_stride) + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, src_stride) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + } + // x_offset = 8 and y_offset = 0 + } else if (x_offset == 8) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i++) { + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = 8 and y_offset = 8 + } else if (y_offset == 8) { + __m256i src_next_reg, src_avg; + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + // average between previous average to current average + src_avg = _mm256_avg_epu8(src_avg, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + sec += sec_stride; + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = 8 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg, src_avg; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + AVG_NEXT_SRC(src_reg, 1) + for (i = 0; i < height; i++) { + // save current source average + src_avg = src_reg; + src += src_stride; + LOAD_SRC_DST + AVG_NEXT_SRC(src_reg, 1) + MERGE_WITH_SRC(src_avg, src_reg) + FILTER_SRC(filter) + src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_avg = _mm256_avg_epu8(src_avg, sec_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_avg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i++) { + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_reg = _mm256_avg_epu8(src_reg, sec_reg); + MERGE_WITH_SRC(src_reg, zero_reg) + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = 8 + } else if (y_offset == 8) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + sec += sec_stride; + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load source and another source starting from the next + // following byte + src_reg = _mm256_loadu_si256((__m256i const *)(src)); + MERGE_NEXT_SRC(src_reg, 1) + + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + for (i = 0; i < height; i++) { + src += src_stride; + LOAD_SRC_DST + MERGE_NEXT_SRC(src_reg, 1) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // merge previous pack to current pack source + MERGE_WITH_SRC(src_pack, src_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); + src_pack = _mm256_avg_epu8(src_pack, sec_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + src_pack = src_reg; + sec += sec_stride; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + } + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c new file mode 100644 index 000000000..d9563aa7f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 + +#include "./aom_config.h" +#include "./aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" + +typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int aom_get_mb_ss_sse2(const int16_t *src) { + __m128i vsum = _mm_setzero_si128(); + int i; + + for (i = 0; i < 32; ++i) { + const __m128i v = _mm_loadu_si128((const __m128i *)src); + vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); + src += 8; + } + + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + return _mm_cvtsi128_si32(vsum); +} + +#define READ64(p, stride, i) \ + _mm_unpacklo_epi8( \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ + _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) + +static void get4x4var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); + const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); + const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); + const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + // sum + __m128i vsum = _mm_add_epi16(diff0, diff1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsum = + _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); + *sse = _mm_cvtsi128_si32(vsum); +} + +void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, unsigned int *sse, int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 8; i += 2) { + const __m128i src0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); + const __m128i ref0 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); + const __m128i ref1 = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + +void aom_get16x16var_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, unsigned int *sse, + int *sum) { + const __m128i zero = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); + __m128i vsse = _mm_setzero_si128(); + int i; + + for (i = 0; i < 16; ++i) { + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i diff0 = _mm_sub_epi16(src0, ref0); + + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + const __m128i diff1 = _mm_sub_epi16(src1, ref1); + + vsum = _mm_add_epi16(vsum, diff0); + vsum = _mm_add_epi16(vsum, diff1); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); + vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + + src += src_stride; + ref += ref_stride; + } + + // sum + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = + (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); + + // sse + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); + vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); + *sse = _mm_cvtsi128_si32(vsse); +} + +static void variance_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, int w, + int h, unsigned int *sse, int *sum, + getNxMvar_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + assert(sum <= 255 * 4 * 4); + assert(sum >= -255 * 4 * 4); + return *sse - ((sum * sum) >> 4); +} + +unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, + get4x4var_sse2, 4); + assert(sum <= 255 * 8 * 4); + assert(sum >= -255 * 8 * 4); + return *sse - ((sum * sum) >> 5); +} + +unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, + get4x4var_sse2, 4); + assert(sum <= 255 * 8 * 4); + assert(sum >= -255 * 8 * 4); + return *sse - ((sum * sum) >> 5); +} + +unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + assert(sum <= 255 * 8 * 8); + assert(sum >= -255 * 8 * 8); + return *sse - ((sum * sum) >> 6); +} + +unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, + aom_get8x8var_sse2, 8); + assert(sum <= 255 * 16 * 8); + assert(sum >= -255 * 16 * 8); + return *sse - ((sum * sum) >> 7); +} + +unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, + aom_get8x8var_sse2, 8); + assert(sum <= 255 * 16 * 8); + assert(sum >= -255 * 16 * 8); + return *sse - ((sum * sum) >> 7); +} + +unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride, + const unsigned char *ref, int ref_stride, + unsigned int *sse) { + int sum; + aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + assert(sum <= 255 * 16 * 16); + assert(sum >= -255 * 16 * 16); + return *sse - ((uint32_t)((int64_t)sum * sum) >> 8); +} + +unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 32 * 32); + assert(sum >= -255 * 32 * 32); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 32 * 16); + assert(sum >= -255 * 32 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 32 * 16); + assert(sum >= -255 * 32 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); +} + +unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 64); + assert(sum >= -255 * 64 * 64); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); +} + +unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 32); + assert(sum >= -255 * 64 * 32); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); +} + +unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 32); + assert(sum >= -255 * 64 * 32); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); +} + +unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + return *sse; +} + +// The 2 unused parameters are place holders for PIC enabled build. +// These definitions are for functions defined in subpel_variance.asm +#define DECL(w, opt) \ + int aom_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ + void *unused0, void *unused) +#define DECLS(opt1, opt2) \ + DECL(4, opt1); \ + DECL(8, opt1); \ + DECL(16, opt1) + +DECLS(sse2, sse2); +DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) + +FNS(sse2, sse2); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN + +// The 2 unused parameters are place holders for PIC enabled build. +#define DECL(w, opt) \ + int aom_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ + ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + void *unused) +#define DECLS(opt1, opt2) \ + DECL(4, opt1); \ + DECL(8, opt1); \ + DECL(16, opt1) + +DECLS(sse2, sse2); +DECLS(ssse3, ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + sec + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + sec + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + sec + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ + } + +#define FNS(opt1, opt2) \ + FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN + +void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, + const uint8_t *ref, int ref_stride) { + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + s0 = _mm_unpacklo_epi32(s0, s2); + s4 = _mm_unpacklo_epi32(s4, s6); + s0 = _mm_unpacklo_epi64(s0, s4); + + _mm_storeu_si128((__m128i *)(comp_pred), s0); + comp_pred += 16; + ref += 16 * 8; + } + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s0 = _mm_unpacklo_epi32(s0, s2); + + _mm_storel_epi64((__m128i *)(comp_pred), s0); + comp_pred += 8; + ref += 8 * 8; + } + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + + *(int *)comp_pred = _mm_cvtsi128_si32(s0); + comp_pred += 4; + ref += 4 * 8; + } + ref += stride - (width << 3); + } + } +} + +void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride) { + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi16(1); + int i, j; + int stride = ref_stride << 3; + + if (width >= 16) { + // read 16 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); + __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); + __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); + __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + __m128i p1; + __m128i t0, t1, t2, t3; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + t2 = _mm_unpacklo_epi8(s4, s5); + s5 = _mm_unpackhi_epi8(s4, s5); + t3 = _mm_unpacklo_epi8(s6, s7); + s7 = _mm_unpackhi_epi8(s6, s7); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s4 = _mm_unpacklo_epi8(t2, s5); + s6 = _mm_unpacklo_epi8(t3, s7); + + s0 = _mm_unpacklo_epi32(s0, s2); + s4 = _mm_unpacklo_epi32(s4, s6); + s0 = _mm_unpacklo_epi8(s0, zero); + s4 = _mm_unpacklo_epi8(s4, zero); + + p1 = _mm_unpackhi_epi8(p0, zero); + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p1 = _mm_adds_epu16(s4, p1); + p0 = _mm_adds_epu16(p0, one); + p1 = _mm_adds_epu16(p1, one); + + p0 = _mm_srli_epi16(p0, 1); + p1 = _mm_srli_epi16(p1, 1); + p0 = _mm_packus_epi16(p0, p1); + + _mm_storeu_si128((__m128i *)(comp_pred), p0); + comp_pred += 16; + pred += 16; + ref += 16 * 8; + } + ref += stride - (width << 3); + } + } else if (width >= 8) { + // read 8 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); + __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); + __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); + __m128i t0, t1; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + t1 = _mm_unpacklo_epi8(s2, s3); + s3 = _mm_unpackhi_epi8(s2, s3); + + s0 = _mm_unpacklo_epi8(t0, s1); + s2 = _mm_unpacklo_epi8(t1, s3); + s0 = _mm_unpacklo_epi32(s0, s2); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + _mm_storel_epi64((__m128i *)(comp_pred), p0); + comp_pred += 8; + pred += 8; + ref += 8 * 8; + } + ref += stride - (width << 3); + } + } else { + // read 4 points at one time + for (i = 0; i < height; i++) { + for (j = 0; j < width; j += 4) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); + __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred); + __m128i t0; + + t0 = _mm_unpacklo_epi8(s0, s1); + s1 = _mm_unpackhi_epi8(s0, s1); + s0 = _mm_unpacklo_epi8(t0, s1); + s0 = _mm_unpacklo_epi8(s0, zero); + + p0 = _mm_unpacklo_epi8(p0, zero); + p0 = _mm_adds_epu16(s0, p0); + p0 = _mm_adds_epu16(p0, one); + p0 = _mm_srli_epi16(p0, 1); + p0 = _mm_packus_epi16(p0, zero); + + *(int *)comp_pred = _mm_cvtsi128_si32(p0); + comp_pred += 4; + pred += 4; + ref += 4 * 8; + } + ref += stride - (width << 3); + } + } +} -- cgit v1.2.3 From df9477dfa60ebb5d31bc142e58ce46535c17abce Mon Sep 17 00:00:00 2001 From: trav90 Date: Wed, 17 Oct 2018 05:59:08 -0500 Subject: Update aom to slightly newer commit ID --- third_party/aom/aom_dsp/aom_convolve.c | 267 +- third_party/aom/aom_dsp/aom_convolve.h | 5 + third_party/aom/aom_dsp/aom_dsp.cmake | 84 +- third_party/aom/aom_dsp/aom_dsp.mk | 42 +- third_party/aom/aom_dsp/aom_dsp_common.h | 2 + third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 177 +- third_party/aom/aom_dsp/arm/avg_neon.c | 38 - .../aom/aom_dsp/arm/bilinear_filter_media.asm | 240 -- third_party/aom/aom_dsp/arm/sad_media.asm | 98 - .../aom/aom_dsp/arm/subpel_variance_media.c | 81 - .../arm/variance_halfpixvar16x16_h_media.asm | 185 -- .../arm/variance_halfpixvar16x16_hv_media.asm | 225 -- .../arm/variance_halfpixvar16x16_v_media.asm | 187 -- third_party/aom/aom_dsp/arm/variance_media.asm | 361 --- third_party/aom/aom_dsp/avg.c | 42 - third_party/aom/aom_dsp/binary_codes_reader.c | 53 +- third_party/aom/aom_dsp/binary_codes_reader.h | 34 +- third_party/aom/aom_dsp/bitreader.h | 62 +- third_party/aom/aom_dsp/bitreader_buffer.c | 2 +- third_party/aom/aom_dsp/bitwriter.h | 63 +- third_party/aom/aom_dsp/dkboolreader.c | 110 - third_party/aom/aom_dsp/dkboolreader.h | 181 -- third_party/aom/aom_dsp/dkboolwriter.c | 44 - third_party/aom/aom_dsp/dkboolwriter.h | 104 - third_party/aom/aom_dsp/intrapred.c | 162 +- third_party/aom/aom_dsp/inv_txfm.c | 864 ++++++ third_party/aom/aom_dsp/loopfilter.c | 110 +- third_party/aom/aom_dsp/mips/avg_msa.c | 57 - third_party/aom/aom_dsp/prob.c | 17 - third_party/aom/aom_dsp/prob.h | 12 +- third_party/aom/aom_dsp/sad.c | 54 +- third_party/aom/aom_dsp/simd/v64_intrinsics.h | 4 +- third_party/aom/aom_dsp/variance.c | 365 +-- third_party/aom/aom_dsp/variance.h | 13 +- .../aom/aom_dsp/x86/aom_convolve_hip_sse2.c | 195 ++ .../aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c | 203 ++ third_party/aom/aom_dsp/x86/avg_intrin_sse2.c | 46 - third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c | 656 +++-- third_party/aom/aom_dsp/x86/inv_txfm_avx2.c | 1238 +++++++++ third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h | 80 + third_party/aom/aom_dsp/x86/inv_txfm_sse2.c | 103 + .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 498 ++-- .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 2799 +++++++------------- third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h | 45 + third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 1 + third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 3 +- third_party/aom/aom_dsp/x86/synonyms.h | 28 - third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 44 +- 48 files changed, 5193 insertions(+), 5091 deletions(-) delete mode 100644 third_party/aom/aom_dsp/arm/bilinear_filter_media.asm delete mode 100644 third_party/aom/aom_dsp/arm/sad_media.asm delete mode 100644 third_party/aom/aom_dsp/arm/subpel_variance_media.c delete mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm delete mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm delete mode 100644 third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm delete mode 100644 third_party/aom/aom_dsp/arm/variance_media.asm delete mode 100644 third_party/aom/aom_dsp/dkboolreader.c delete mode 100644 third_party/aom/aom_dsp/dkboolreader.h delete mode 100644 third_party/aom/aom_dsp/dkboolwriter.c delete mode 100644 third_party/aom/aom_dsp/dkboolwriter.h delete mode 100644 third_party/aom/aom_dsp/mips/avg_msa.c create mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c index 74f4c00fb..4dac6aacc 100644 --- a/third_party/aom/aom_dsp/aom_convolve.c +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -337,14 +337,14 @@ static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h) { - int x, y; + int x, y, k; src -= SUBPEL_TAPS / 2 - 1; for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; + int sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1]); @@ -359,7 +359,7 @@ static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h) { - int x, y; + int x, y, k; src -= src_stride * (SUBPEL_TAPS / 2 - 1); for (x = 0; x < w; ++x) { @@ -367,7 +367,7 @@ static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride, for (y = 0; y < h; ++y) { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; + int sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; dst[y * dst_stride] = @@ -446,6 +446,127 @@ void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, h); } + +static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + const int bd = 8; + int x, y, k; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = + (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS), + 0, EXTRAPREC_CLAMP_LIMIT(bd) - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + const int bd = 8; + int x, y, k; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int sum = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)); + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, + const InterpKernel *const y_filters, int y0_q4, + int y_step_q4, int w, int h) { + uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, + x_step_q4, w, intermediate_height); + convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, + y_step_q4, w, h); +} + +void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + (void)filter_y; + (void)y_step_q4; + + convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, w, h); +} + +void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + (void)filter_x; + (void)x_step_q4; + + convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4, + y_step_q4, w, h); +} + +void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4, + x_step_q4, filters_y, y0_q4, y_step_q4, w, h); +} #endif // CONFIG_LOOP_RESTORATION #if CONFIG_HIGHBITDEPTH @@ -721,7 +842,7 @@ static void highbd_convolve_add_src_horiz(const uint8_t *src8, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { - int x, y; + int x, y, k; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; @@ -730,7 +851,7 @@ static void highbd_convolve_add_src_horiz(const uint8_t *src8, for (x = 0; x < w; ++x) { const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; + int sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; dst[x] = clip_pixel_highbd( ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1], @@ -748,7 +869,7 @@ static void highbd_convolve_add_src_vert(const uint8_t *src8, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { - int x, y; + int x, y, k; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -757,7 +878,7 @@ static void highbd_convolve_add_src_vert(const uint8_t *src8, for (y = 0; y < h; ++y) { const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; + int sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_y[k * src_stride] * y_filter[k]; dst[y * dst_stride] = @@ -850,5 +971,135 @@ void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd); } + +static void highbd_convolve_add_src_horiz_hip( + const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h, int bd) { + const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd); + int x, y, k; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + + (1 << (bd + FILTER_BITS - 1)); + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = + (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS), + 0, extraprec_clamp_limit - 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void highbd_convolve_add_src_vert_hip( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + int x, y, k; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int sum = + ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - + (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)); + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel_highbd( + ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void highbd_convolve_add_src_hip( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4, + int x_step_q4, const InterpKernel *const y_filters, int y0_q4, + int y_step_q4, int w, int h, int bd) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + assert(y_step_q4 <= 32); + assert(x_step_q4 <= 32); + + highbd_convolve_add_src_horiz_hip( + src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE, + x_filters, x0_q4, x_step_q4, w, intermediate_height, bd); + highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), + MAX_SB_SIZE, dst, dst_stride, y_filters, + y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_horiz_hip_c( + const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; + (void)y_step_q4; + + highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_vert_hip_c( + const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; + (void)x_step_q4; + + highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, + y0_q4, y_step_q4, w, h, bd); +} + +void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + const int x0_q4 = get_filter_offset(filter_x, filters_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + const int y0_q4 = get_filter_offset(filter_y, filters_y); + + highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, + x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, + h, bd); +} + #endif // CONFIG_LOOP_RESTORATION #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h index d0de6c5d2..c7943dced 100644 --- a/third_party/aom/aom_dsp/aom_convolve.h +++ b/third_party/aom/aom_dsp/aom_convolve.h @@ -36,6 +36,11 @@ extern "C" { #define MAX_EXT_SIZE 135 #endif // CONFIG_AV1 && CONFIG_EXT_PARTITION +#if CONFIG_AV1 && CONFIG_LOOP_RESTORATION +#define EXTRAPREC_BITS 2 +#define EXTRAPREC_CLAMP_LIMIT(bd) (1 << ((bd) + 1 + EXTRAPREC_BITS)) +#endif + typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index f00348cbc..5a49ae817 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -8,6 +8,9 @@ ## Media Patent License 1.0 was not distributed with this source code in the ## PATENTS file, you can obtain it at www.aomedia.org/license/patent. ## +if (NOT AOM_AOM_DSP_AOM_DSP_CMAKE_) +set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) + set(AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/aom_convolve.c" "${AOM_ROOT}/aom_dsp/aom_convolve.h" @@ -23,7 +26,6 @@ set(AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/loopfilter.c" "${AOM_ROOT}/aom_dsp/prob.c" "${AOM_ROOT}/aom_dsp/prob.h" - "${AOM_ROOT}/aom_dsp/sad.c" "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" @@ -62,8 +64,10 @@ set(AOM_DSP_COMMON_INTRIN_SSE4_1 set(AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h") set(AOM_DSP_COMMON_ASM_NEON "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm" @@ -175,6 +179,8 @@ set(AOM_DSP_COMMON_INTRIN_MSA if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_COMMON_ASM_SSE2 ${AOM_DSP_COMMON_ASM_SSE2} + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm") set(AOM_DSP_COMMON_INTRIN_SSE2 @@ -198,7 +204,7 @@ if (CONFIG_ANS) set(AOM_DSP_COMMON_SOURCES ${AOM_DSP_COMMON_SOURCES} "${AOM_ROOT}/aom_dsp/ans.h") -elseif (CONFIG_DAALA_EC) +else () set(AOM_DSP_COMMON_SOURCES ${AOM_DSP_COMMON_SOURCES} "${AOM_ROOT}/aom_dsp/entcode.c" @@ -221,7 +227,7 @@ if (CONFIG_AV1) "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.h") endif () -if (CONFIG_DECODERS) +if (CONFIG_AV1_DECODER) set(AOM_DSP_DECODER_SOURCES "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" @@ -233,22 +239,17 @@ if (CONFIG_DECODERS) set(AOM_DSP_DECODER_SOURCES ${AOM_DSP_DECODER_SOURCES} "${AOM_ROOT}/aom_dsp/ansreader.h") - elseif (CONFIG_DAALA_EC) + else () set(AOM_DSP_DECODER_SOURCES ${AOM_DSP_DECODER_SOURCES} "${AOM_ROOT}/aom_dsp/daalaboolreader.c" "${AOM_ROOT}/aom_dsp/daalaboolreader.h" "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h") - else () - set(AOM_DSP_DECODER_SOURCES - ${AOM_DSP_DECODER_SOURCES} - "${AOM_ROOT}/aom_dsp/dkboolreader.c" - "${AOM_ROOT}/aom_dsp/dkboolreader.h") endif () endif () -if (CONFIG_ENCODERS) +if (CONFIG_AV1_ENCODER) set(AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" @@ -257,6 +258,7 @@ if (CONFIG_ENCODERS) "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" "${AOM_ROOT}/aom_dsp/psnr.c" "${AOM_ROOT}/aom_dsp/psnr.h" + "${AOM_ROOT}/aom_dsp/sad.c" "${AOM_ROOT}/aom_dsp/variance.c" "${AOM_ROOT}/aom_dsp/variance.h") @@ -282,6 +284,9 @@ if (CONFIG_ENCODERS) set(AOM_DSP_ENCODER_ASM_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sad_sse4.asm") set(AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.h" "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" @@ -310,11 +315,6 @@ if (CONFIG_ENCODERS) "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c") - set(AOM_DSP_ENCODER_INTRIN_SSSE3 - ${AOM_DSP_ENCODER_INTRIN_SSSE3} - "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c") - set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64 ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64} "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm" @@ -325,7 +325,6 @@ if (CONFIG_ENCODERS) "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") set(AOM_DSP_ENCODER_INTRIN_MSA - "${AOM_ROOT}/aom_dsp/mips/avg_msa.c" "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" @@ -345,9 +344,7 @@ if (CONFIG_ENCODERS) "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm") + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm") set(AOM_DSP_ENCODER_INTRIN_SSE2 ${AOM_DSP_ENCODER_INTRIN_SSE2} @@ -368,18 +365,13 @@ if (CONFIG_ENCODERS) "${AOM_ROOT}/aom_dsp/answriter.h" "${AOM_ROOT}/aom_dsp/buf_ans.c" "${AOM_ROOT}/aom_dsp/buf_ans.h") - elseif (CONFIG_DAALA_EC) + else () set(AOM_DSP_ENCODER_SOURCES ${AOM_DSP_ENCODER_SOURCES} "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" "${AOM_ROOT}/aom_dsp/entenc.c" "${AOM_ROOT}/aom_dsp/entenc.h") - else () - set(AOM_DSP_ENCODER_SOURCES - ${AOM_DSP_ENCODER_SOURCES} - "${AOM_ROOT}/aom_dsp/dkboolwriter.c" - "${AOM_ROOT}/aom_dsp/dkboolwriter.h") endif () if (CONFIG_INTERNAL_STATS) @@ -392,6 +384,18 @@ if (CONFIG_ENCODERS) endif () endif () +if (CONFIG_LOOP_RESTORATION) + set(AOM_DSP_COMMON_INTRIN_SSE2 + ${AOM_DSP_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c") + + if (CONFIG_HIGHBITDEPTH) + set(AOM_DSP_COMMON_INTRIN_SSSE3 + ${AOM_DSP_COMMON_INTRIN_SSSE3} + "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c") + endif () +endif () + if (CONFIG_MOTION_VAR) set(AOM_DSP_ENCODER_INTRIN_SSE4_1 ${AOM_DSP_ENCODER_INTRIN_SSE4_1} @@ -406,13 +410,13 @@ function (setup_aom_dsp_targets) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_common) target_sources(aom PUBLIC $) - if (CONFIG_DECODERS) + if (CONFIG_AV1_DECODER) add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder) target_sources(aom PUBLIC $) endif () - if (CONFIG_ENCODERS) + if (CONFIG_AV1_ENCODER) add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder) target_sources(aom PUBLIC $) @@ -422,14 +426,14 @@ function (setup_aom_dsp_targets) add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSE2") - if (CONFIG_ENCODERS) + if (CONFIG_AV1_ENCODER) add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SSE2") endif() endif () - if (HAVE_SSE3 AND CONFIG_ENCODERS) + if (HAVE_SSE3 AND CONFIG_AV1_ENCODER) add_asm_library("aom_dsp_encoder_sse3" "AOM_DSP_ENCODER_INTRIN_SSE3" "aom") endif () @@ -438,21 +442,19 @@ function (setup_aom_dsp_targets) add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSSE3") - if (CONFIG_ENCODERS) + if (CONFIG_AV1_ENCODER) if ("${AOM_TARGET_CPU}" STREQUAL "x86_64") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) endif () add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom") - add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSSE3") endif () endif () if (HAVE_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSE4_1") - if (CONFIG_ENCODERS) + if (CONFIG_AV1_ENCODER) if (AOM_DSP_ENCODER_INTRIN_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SSE4_1") @@ -463,14 +465,16 @@ function (setup_aom_dsp_targets) endif () if (HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") - add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64" - "aom") + if (CONFIG_AV1_ENCODER) + add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64" + "aom") + endif () endif () if (HAVE_AVX2) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_AVX2") - if (CONFIG_ENCODERS) + if (CONFIG_AV1_ENCODER) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_AVX2") endif () @@ -497,7 +501,7 @@ function (setup_aom_dsp_targets) if (HAVE_MSA) add_intrinsics_object_library("" "msa" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_MSA") - if (CONFIG_ENCODERS) + if (CONFIG_AV1_ENCODER) add_intrinsics_object_library("" "msa" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_MSA") endif () @@ -507,3 +511,5 @@ function (setup_aom_dsp_targets) # $AOM_LIB_TARGETS. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) endfunction () + +endif () # AOM_AOM_DSP_AOM_DSP_CMAKE_ diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk index 8c7241b83..6e2d5630e 100644 --- a/third_party/aom/aom_dsp/aom_dsp.mk +++ b/third_party/aom/aom_dsp/aom_dsp.mk @@ -22,19 +22,16 @@ DSP_SRCS-yes += prob.h DSP_SRCS-yes += prob.c DSP_SRCS-$(CONFIG_ANS) += ans.h -ifeq ($(CONFIG_ENCODERS),yes) +ifeq ($(CONFIG_AV1_ENCODER),yes) ifeq ($(CONFIG_ANS),yes) DSP_SRCS-yes += answriter.h DSP_SRCS-yes += buf_ans.h DSP_SRCS-yes += buf_ans.c -else ifeq ($(CONFIG_DAALA_EC),yes) +else DSP_SRCS-yes += entenc.c DSP_SRCS-yes += entenc.h DSP_SRCS-yes += daalaboolwriter.c DSP_SRCS-yes += daalaboolwriter.h -else -DSP_SRCS-yes += dkboolwriter.h -DSP_SRCS-yes += dkboolwriter.c endif DSP_SRCS-yes += bitwriter.h DSP_SRCS-yes += bitwriter_buffer.c @@ -49,17 +46,14 @@ DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c endif -ifeq ($(CONFIG_DECODERS),yes) +ifeq ($(CONFIG_AV1_DECODER),yes) ifeq ($(CONFIG_ANS),yes) DSP_SRCS-yes += ansreader.h -else ifeq ($(CONFIG_DAALA_EC),yes) +else DSP_SRCS-yes += entdec.c DSP_SRCS-yes += entdec.h DSP_SRCS-yes += daalaboolreader.c DSP_SRCS-yes += daalaboolreader.h -else -DSP_SRCS-yes += dkboolreader.h -DSP_SRCS-yes += dkboolreader.c endif DSP_SRCS-yes += bitreader.h DSP_SRCS-yes += bitreader_buffer.c @@ -71,7 +65,7 @@ endif # intra predictions DSP_SRCS-yes += intrapred.c -ifeq ($(CONFIG_DAALA_EC),yes) +ifneq ($(CONFIG_ANS),yes) DSP_SRCS-yes += entcode.c DSP_SRCS-yes += entcode.h endif @@ -205,6 +199,7 @@ endif # CONFIG_HIGHBITDEPTH DSP_SRCS-yes += txfm_common.h DSP_SRCS-yes += x86/txfm_common_intrin.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h +DSP_SRCS-$(HAVE_SSSE3) += x86/obmc_intrinsic_ssse3.h DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h # forward transform @@ -239,6 +234,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_common_avx2.h +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/save_reg_neon$(ASM) @@ -278,6 +275,13 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c endif # CONFIG_HIGHBITDEPTH + +ifeq ($(CONFIG_LOOP_RESTORATION),yes) +DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_hip_sse2.c +ifeq ($(CONFIG_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_SSSE3) += x86/aom_highbd_convolve_hip_ssse3.c +endif +endif # CONFIG_LOOP_RESTORATION endif # CONFIG_AV1 # quantization @@ -298,7 +302,6 @@ endif DSP_SRCS-yes += avg.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c -DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm @@ -317,11 +320,10 @@ DSP_SRCS-yes += sum_squares.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c endif # CONFIG_AV1_ENCODER -ifeq ($(CONFIG_ENCODERS),yes) +ifeq ($(CONFIG_AV1_ENCODER),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c -DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c @@ -364,18 +366,12 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm endif # CONFIG_HIGHBITDEPTH -endif # CONFIG_ENCODERS +endif # CONFIG_AV1_ENCODER -ifneq ($(filter yes,$(CONFIG_ENCODERS)),) +ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),) DSP_SRCS-yes += variance.c DSP_SRCS-yes += variance.h -DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c @@ -402,7 +398,7 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm endif # CONFIG_HIGHBITDEPTH -endif # CONFIG_ENCODERS +endif # CONFIG_AV1_ENCODER DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h index 47ffbeb6c..82f9a95e9 100644 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -31,6 +31,8 @@ extern "C" { #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) #define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) +#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) + #define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') #define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index b4ef0d92f..8047cbc09 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -49,6 +49,9 @@ if (aom_config("CONFIG_TX64X64") eq "yes") { @pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153/; if (aom_config("CONFIG_ALT_INTRA") eq "yes") { push @pred_names, qw/paeth smooth/; + if (aom_config("CONFIG_SMOOTH_HV") eq "yes") { + push @pred_names, qw/smooth_v smooth_h/; + } } else { push @pred_names, 'tm'; } @@ -168,10 +171,14 @@ if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + add_proto qw/void aom_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + add_proto qw/void aom_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + add_proto qw/void aom_convolve8_add_src_vert_hip/, "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; specialize qw/aom_convolve8_add_src ssse3/; specialize qw/aom_convolve8_add_src_horiz ssse3/; specialize qw/aom_convolve8_add_src_vert ssse3/; + specialize qw/aom_convolve8_add_src_hip sse2/; } # CONFIG_LOOP_RESTORATION # TODO(any): These need to be extended to up to 128x128 block sizes @@ -215,8 +222,12 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void aom_highbd_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void aom_highbd_convolve8_add_src_vert_hip/, "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64"; + specialize qw/aom_highbd_convolve8_add_src_hip ssse3/; # The _horiz/_vert functions are currently unused, so we don't bother # specialising them. } # CONFIG_LOOP_RESTORATION @@ -434,29 +445,30 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { specialize qw/aom_idct8x8_1_add sse2/; add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_256_add sse2/; + specialize qw/aom_idct16x16_256_add sse2 avx2/; add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_38_add avx2/; add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_10_add sse2/; + specialize qw/aom_idct16x16_10_add sse2 avx2/; add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_1_add sse2/; + specialize qw/aom_idct16x16_1_add sse2 avx2/; add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1024_add sse2 ssse3/; + specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/; add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_135_add sse2 ssse3/; + specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/; # Need to add 135 eob idct32x32 implementations. $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_34_add sse2 ssse3/; + specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/; add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1_add sse2/; + specialize qw/aom_idct32x32_1_add sse2 avx2/; add_proto qw/void aom_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/aom_highbd_idct4x4_16_add sse2/; @@ -479,21 +491,22 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { specialize qw/aom_idct8x8_12_add sse2 ssse3 neon dspr2 msa/; add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_1_add sse2 neon dspr2 msa/; + specialize qw/aom_idct16x16_1_add sse2 avx2 neon dspr2 msa/; add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_256_add sse2 neon dspr2 msa/; + specialize qw/aom_idct16x16_256_add sse2 avx2 neon dspr2 msa/; add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_38_add avx2/; add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_10_add sse2 neon dspr2 msa/; + specialize qw/aom_idct16x16_10_add sse2 avx2 neon dspr2 msa/; add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/; + specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2 neon dspr2 msa/; add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_135_add sse2 ssse3 neon dspr2 msa/; + specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2 neon dspr2 msa/; # Need to add 135 eob idct32x32 implementations. $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; $aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon; @@ -501,12 +514,12 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { $aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa; add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_34_add sse2 ssse3 neon dspr2 msa/; + specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2 neon dspr2 msa/; # Need to add 34 eob idct32x32 neon implementation. $aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon; add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1_add sse2 neon dspr2 msa/; + specialize qw/aom_idct32x32_1_add sse2 avx2 neon dspr2 msa/; add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_iwht4x4_1_add msa/; @@ -578,7 +591,7 @@ if (aom_config("CONFIG_AV1") eq "yes") { } } # CONFIG_AV1 -if (aom_config("CONFIG_ENCODERS") eq "yes") { +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Block subtraction # @@ -604,13 +617,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Avg # - add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p"; specialize qw/aom_avg_8x8 sse2 neon msa/; - add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p"; - specialize qw/aom_avg_4x4 sse2 neon msa/; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p"; - add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p"; add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/aom_highbd_subtract_block sse2/; } @@ -652,22 +660,22 @@ foreach (@block_sizes) { add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; } -specialize qw/aom_sad128x128 avx2 sse2/; -specialize qw/aom_sad128x64 avx2 sse2/; -specialize qw/aom_sad64x128 avx2 sse2/; -specialize qw/aom_sad64x64 avx2 neon msa sse2/; -specialize qw/aom_sad64x32 avx2 msa sse2/; -specialize qw/aom_sad32x64 avx2 msa sse2/; -specialize qw/aom_sad32x32 avx2 neon msa sse2/; -specialize qw/aom_sad32x16 avx2 msa sse2/; -specialize qw/aom_sad16x32 msa sse2/; -specialize qw/aom_sad16x16 media neon msa sse2/; -specialize qw/aom_sad16x8 neon msa sse2/; -specialize qw/aom_sad8x16 neon msa sse2/; -specialize qw/aom_sad8x8 neon msa sse2/; -specialize qw/aom_sad8x4 msa sse2/; -specialize qw/aom_sad4x8 msa sse2/; -specialize qw/aom_sad4x4 neon msa sse2/; +specialize qw/aom_sad128x128 avx2 sse2/; +specialize qw/aom_sad128x64 avx2 sse2/; +specialize qw/aom_sad64x128 avx2 sse2/; +specialize qw/aom_sad64x64 avx2 neon msa sse2/; +specialize qw/aom_sad64x32 avx2 msa sse2/; +specialize qw/aom_sad32x64 avx2 msa sse2/; +specialize qw/aom_sad32x32 avx2 neon msa sse2/; +specialize qw/aom_sad32x16 avx2 msa sse2/; +specialize qw/aom_sad16x32 msa sse2/; +specialize qw/aom_sad16x16 neon msa sse2/; +specialize qw/aom_sad16x8 neon msa sse2/; +specialize qw/aom_sad8x16 neon msa sse2/; +specialize qw/aom_sad8x8 neon msa sse2/; +specialize qw/aom_sad8x4 msa sse2/; +specialize qw/aom_sad4x8 msa sse2/; +specialize qw/aom_sad4x4 neon msa sse2/; specialize qw/aom_sad128x128_avg avx2 sse2/; specialize qw/aom_sad128x64_avg avx2 sse2/; @@ -727,14 +735,14 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_EXT_INTER") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; specialize "aom_masked_sad${w}x${h}", qw/ssse3/; } if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride"; + add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/; } } @@ -876,9 +884,9 @@ if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; } } -} # CONFIG_ENCODERS +} # CONFIG_AV1_ENCODER -if (aom_config("CONFIG_ENCODERS") eq "yes") { +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Specialty Variance @@ -896,10 +904,10 @@ add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_str add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/aom_mse16x16 sse2 avx2 media neon msa/; -specialize qw/aom_mse16x8 sse2 msa/; -specialize qw/aom_mse8x16 sse2 msa/; -specialize qw/aom_mse8x8 sse2 msa/; +specialize qw/aom_mse16x16 sse2 avx2 neon msa/; +specialize qw/aom_mse16x8 sse2 msa/; +specialize qw/aom_mse8x16 sse2 msa/; +specialize qw/aom_mse8x8 sse2 msa/; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { foreach $bd (8, 10, 12) { @@ -956,33 +964,33 @@ foreach (@block_sizes) { add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; } -specialize qw/aom_variance64x64 sse2 avx2 neon msa/; -specialize qw/aom_variance64x32 sse2 avx2 neon msa/; -specialize qw/aom_variance32x64 sse2 neon msa/; -specialize qw/aom_variance32x32 sse2 avx2 neon msa/; -specialize qw/aom_variance32x16 sse2 avx2 msa/; -specialize qw/aom_variance16x32 sse2 msa/; -specialize qw/aom_variance16x16 sse2 avx2 media neon msa/; -specialize qw/aom_variance16x8 sse2 neon msa/; -specialize qw/aom_variance8x16 sse2 neon msa/; -specialize qw/aom_variance8x8 sse2 media neon msa/; -specialize qw/aom_variance8x4 sse2 msa/; -specialize qw/aom_variance4x8 sse2 msa/; -specialize qw/aom_variance4x4 sse2 msa/; - -specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x16 media neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x8 media neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; +specialize qw/aom_variance64x64 sse2 avx2 neon msa/; +specialize qw/aom_variance64x32 sse2 avx2 neon msa/; +specialize qw/aom_variance32x64 sse2 neon msa/; +specialize qw/aom_variance32x32 sse2 avx2 neon msa/; +specialize qw/aom_variance32x16 sse2 avx2 msa/; +specialize qw/aom_variance16x32 sse2 msa/; +specialize qw/aom_variance16x16 sse2 avx2 neon msa/; +specialize qw/aom_variance16x8 sse2 neon msa/; +specialize qw/aom_variance8x16 sse2 neon msa/; +specialize qw/aom_variance8x8 sse2 neon msa/; +specialize qw/aom_variance8x4 sse2 msa/; +specialize qw/aom_variance4x8 sse2 msa/; +specialize qw/aom_variance4x4 sse2 msa/; + +specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; +specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; @@ -1034,19 +1042,15 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") { # foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; - specialize "aom_masked_variance${w}x${h}", qw/ssse3/; + add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/; } if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd ("_", "_10_", "_12_") { + foreach $bd ("_8_", "_10_", "_12_") { foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse"; - specialize "aom_highbd${bd}masked_variance${w}x${h}", qw/ssse3/; + add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/; } } @@ -1119,13 +1123,13 @@ add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i # Specialty Subpixel # add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_h sse2 media/; + specialize qw/aom_variance_halfpixvar16x16_h sse2/; add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_v sse2 media/; + specialize qw/aom_variance_halfpixvar16x16_v sse2/; add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_hv sse2 media/; + specialize qw/aom_variance_halfpixvar16x16_hv sse2/; # # Comp Avg @@ -1490,6 +1494,15 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { } # CONFIG_HIGHBITDEPTH -} # CONFIG_ENCODERS +if (aom_config("CONFIG_EXT_INTER") eq "yes") { + add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + } +} + +} # CONFIG_AV1_ENCODER 1; diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c index e730ccbcc..6ff760017 100644 --- a/third_party/aom/aom_dsp/arm/avg_neon.c +++ b/third_party/aom/aom_dsp/arm/avg_neon.c @@ -25,44 +25,6 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { return vget_lane_u32(c, 0); } -unsigned int aom_avg_4x4_neon(const uint8_t *s, int p) { - uint16x8_t v_sum; - uint32x2_t v_s0 = vdup_n_u32(0); - uint32x2_t v_s1 = vdup_n_u32(0); - v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); - v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); - v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; -} - -unsigned int aom_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); - - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - return (horizontal_add_u16x8(v_sum) + 32) >> 6; -} - // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. int aom_satd_neon(const int16_t *coeff, int length) { diff --git a/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm b/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm deleted file mode 100644 index 17b7d25f9..000000000 --- a/third_party/aom/aom_dsp/arm/bilinear_filter_media.asm +++ /dev/null @@ -1,240 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_filter_block2d_bil_first_pass_media| - EXPORT |aom_filter_block2d_bil_second_pass_media| - - AREA |.text|, CODE, READONLY ; name this block of code - -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 unsigned short *dst_ptr, -; r2 unsigned int src_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *aom_filter -;------------------------------------- -; The output is transposed stroed in output array to make it easy for second pass filtering. -|aom_filter_block2d_bil_first_pass_media| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; aom_filter address - ldr r4, [sp, #36] ; width - - mov r12, r3 ; outer-loop counter - - add r7, r2, r4 ; preload next row - pld [r0, r7] - - sub r2, r2, r4 ; src increment for height loop - - ldr r5, [r11] ; load up filter coefficients - - mov r3, r3, lsl #1 ; height*2 - add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) - - mov r11, r1 ; save dst_ptr for each row - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_1st_filter - -|bil_height_loop_1st_v6| - ldrb r6, [r0] ; load source data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - mov lr, r4, lsr #2 ; 4-in-parellel loop counter - -|bil_width_loop_1st_v6| - ldrb r9, [r0, #3] - ldrb r10, [r0, #4] - - pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] - pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] - - smuad r6, r6, r5 ; apply the filter - pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] - smuad r7, r7, r5 - pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] - - smuad r8, r8, r5 - smuad r9, r9, r5 - - add r0, r0, #4 - subs lr, lr, #1 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #16, r6, asr #7 - usat r7, #16, r7, asr #7 - - strh r6, [r1], r3 ; result is transposed and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strh r7, [r1], r3 - add r9, r9, #0x40 - usat r8, #16, r8, asr #7 - usat r9, #16, r9, asr #7 - - strh r8, [r1], r3 ; result is transposed and stored - - ldrneb r6, [r0] ; load source data - strh r9, [r1], r3 - - ldrneb r7, [r0, #1] - ldrneb r8, [r0, #2] - - bne bil_width_loop_1st_v6 - - add r0, r0, r2 ; move to next input row - subs r12, r12, #1 - - add r9, r2, r4, lsl #1 ; adding back block width - pld [r0, r9] ; preload next row - - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_1st_v6 - - ldmia sp!, {r4 - r11, pc} - -|bil_null_1st_filter| -|bil_height_loop_null_1st| - mov lr, r4, lsr #2 ; loop counter - -|bil_width_loop_null_1st| - ldrb r6, [r0] ; load data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - ldrb r9, [r0, #3] - - strh r6, [r1], r3 ; store it to immediate buffer - add r0, r0, #4 - strh r7, [r1], r3 - subs lr, lr, #1 - strh r8, [r1], r3 - strh r9, [r1], r3 - - bne bil_width_loop_null_1st - - subs r12, r12, #1 - add r0, r0, r2 ; move to next input line - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_null_1st - - ldmia sp!, {r4 - r11, pc} - - ENDP ; |aom_filter_block2d_bil_first_pass_media| - - -;--------------------------------- -; r0 unsigned short *src_ptr, -; r1 unsigned char *dst_ptr, -; r2 int dst_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *aom_filter -;--------------------------------- -|aom_filter_block2d_bil_second_pass_media| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; aom_filter address - ldr r4, [sp, #36] ; width - - ldr r5, [r11] ; load up filter coefficients - mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix - mov r11, r1 - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_2nd_filter - -|bil_height_loop_2nd| - ldr r6, [r0] ; load the data - ldr r8, [r0, #4] - ldrh r10, [r0, #8] - mov lr, r3, lsr #2 ; loop counter - -|bil_width_loop_2nd| - pkhtb r7, r6, r8 ; src[1] | src[2] - pkhtb r9, r8, r10 ; src[3] | src[4] - - smuad r6, r6, r5 ; apply filter - smuad r8, r8, r5 ; apply filter - - subs lr, lr, #1 - - smuadx r7, r7, r5 ; apply filter - smuadx r9, r9, r5 ; apply filter - - add r0, r0, #8 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #8, r6, asr #7 - usat r7, #8, r7, asr #7 - strb r6, [r1], r2 ; the result is transposed back and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strb r7, [r1], r2 - add r9, r9, #0x40 - usat r8, #8, r8, asr #7 - usat r9, #8, r9, asr #7 - strb r8, [r1], r2 ; the result is transposed back and stored - - ldrne r6, [r0] ; load data - strb r9, [r1], r2 - ldrne r8, [r0, #4] - ldrneh r10, [r0, #8] - - bne bil_width_loop_2nd - - subs r12, r12, #1 - add r0, r0, #4 ; update src for next row - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_2nd - ldmia sp!, {r4 - r11, pc} - -|bil_null_2nd_filter| -|bil_height_loop_null_2nd| - mov lr, r3, lsr #2 - -|bil_width_loop_null_2nd| - ldr r6, [r0], #4 ; load data - subs lr, lr, #1 - ldr r8, [r0], #4 - - strb r6, [r1], r2 ; store data - mov r7, r6, lsr #16 - strb r7, [r1], r2 - mov r9, r8, lsr #16 - strb r8, [r1], r2 - strb r9, [r1], r2 - - bne bil_width_loop_null_2nd - - subs r12, r12, #1 - add r0, r0, #4 - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_null_2nd - - ldmia sp!, {r4 - r11, pc} - ENDP ; |aom_filter_block2d_second_pass_media| - - END diff --git a/third_party/aom/aom_dsp/arm/sad_media.asm b/third_party/aom/aom_dsp/arm/sad_media.asm deleted file mode 100644 index 49ddb6764..000000000 --- a/third_party/aom/aom_dsp/arm/sad_media.asm +++ /dev/null @@ -1,98 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_sad16x16_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 const unsigned char *src_ptr -; r1 int src_stride -; r2 const unsigned char *ref_ptr -; r3 int ref_stride -|aom_sad16x16_media| PROC - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - mov r4, #0 ; sad = 0; - mov r5, #8 ; loop count - -loop - ; 1st row - ldr r6, [r0, #0x0] ; load 4 src pixels (1A) - ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) - ldr r7, [r0, #0x4] ; load 4 src pixels (1A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) - ldr r10, [r0, #0x8] ; load 4 src pixels (1B) - ldr r11, [r0, #0xC] ; load 4 src pixels (1B) - - usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - ldr r6, [r0, #0x0] ; load 4 src pixels (2A) - ldr r7, [r0, #0x4] ; load 4 src pixels (2A) - add r4, r4, r8 ; add partial sad values - - ; 2nd row - ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) - ldr r10, [r0, #0x8] ; load 4 src pixels (2B) - ldr r11, [r0, #0xC] ; load 4 src pixels (2B) - - usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - subs r5, r5, #1 ; decrement loop counter - add r4, r4, r8 ; add partial sad values - - bne loop - - mov r0, r4 ; return sad - ldmfd sp!, {r4-r12, pc} - - ENDP - - END - diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_media.c b/third_party/aom/aom_dsp/arm/subpel_variance_media.c deleted file mode 100644 index 46ec028d3..000000000 --- a/third_party/aom/aom_dsp/arm/subpel_variance_media.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -#if HAVE_MEDIA -static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 }, - { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, - { 32, 96 }, { 16, 112 } }; - -extern void aom_filter_block2d_bil_first_pass_media( - const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch, - uint32_t height, uint32_t width, const int16_t *filter); - -extern void aom_filter_block2d_bil_second_pass_media( - const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch, - uint32_t height, uint32_t width, const int16_t *filter); - -unsigned int aom_sub_pixel_variance8x8_media( - const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t first_pass[10 * 8]; - uint8_t second_pass[8 * 8]; - const int16_t *HFilter, *VFilter; - - HFilter = bilinear_filters_media[xoffset]; - VFilter = bilinear_filters_media[yoffset]; - - aom_filter_block2d_bil_first_pass_media(src_ptr, first_pass, - src_pixels_per_line, 9, 8, HFilter); - aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, - VFilter); - - return aom_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, - sse); -} - -unsigned int aom_sub_pixel_variance16x16_media( - const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t first_pass[36 * 16]; - uint8_t second_pass[20 * 16]; - const int16_t *HFilter, *VFilter; - unsigned int var; - - if (xoffset == 4 && yoffset == 0) { - var = aom_variance_halfpixvar16x16_h_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 0 && yoffset == 4) { - var = aom_variance_halfpixvar16x16_v_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 4 && yoffset == 4) { - var = aom_variance_halfpixvar16x16_hv_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else { - HFilter = bilinear_filters_media[xoffset]; - VFilter = bilinear_filters_media[yoffset]; - - aom_filter_block2d_bil_first_pass_media( - src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); - aom_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, - 16, VFilter); - - var = aom_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, - sse); - } - return var; -} -#endif // HAVE_MEDIA diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm deleted file mode 100644 index 1e5c9178e..000000000 --- a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_h_media.asm +++ /dev/null @@ -1,185 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance_halfpixvar16x16_h_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance_halfpixvar16x16_h_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END - diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm deleted file mode 100644 index 9e0af830e..000000000 --- a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_hv_media.asm +++ /dev/null @@ -1,225 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance_halfpixvar16x16_hv_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance_halfpixvar16x16_hv_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; pointer to pixels on the next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load source pixels a, row N - ldr r6, [r0, #1] ; load source pixels b, row N - ldr r5, [r9, #0] ; load source pixels c, row N+1 - ldr r7, [r9, #1] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #0] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load source pixels a, row N - ldr r6, [r0, #5] ; load source pixels b, row N - ldr r5, [r9, #4] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #5] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #4] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load source pixels a, row N - ldr r6, [r0, #9] ; load source pixels b, row N - ldr r5, [r9, #8] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #9] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #8] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load source pixels a, row N - ldr r6, [r0, #13] ; load source pixels b, row N - ldr r5, [r9, #12] ; load source pixels c, row N+1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - ldr r7, [r9, #13] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #12] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm b/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm deleted file mode 100644 index 545b68179..000000000 --- a/third_party/aom/aom_dsp/arm/variance_halfpixvar16x16_v_media.asm +++ /dev/null @@ -1,187 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance_halfpixvar16x16_v_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance_halfpixvar16x16_v_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; set src pointer to next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r9, #0] ; load 4 src pixels from next row - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r9, #4] ; load 4 src pixels from next row - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r9, #8] ; load 4 src pixels from next row - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r9, #12] ; load 4 src pixels from next row - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END - diff --git a/third_party/aom/aom_dsp/arm/variance_media.asm b/third_party/aom/aom_dsp/arm/variance_media.asm deleted file mode 100644 index fdc311a81..000000000 --- a/third_party/aom/aom_dsp/arm/variance_media.asm +++ /dev/null @@ -1,361 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_variance16x16_media| - EXPORT |aom_variance8x8_media| - EXPORT |aom_mse16x16_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance16x16_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop16x16 - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop16x16 - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|aom_variance8x8_media| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop8x8 - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop8x8 - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on aom_variance16x16_media. In this function, sum is never used. -; So, we can remove this part of calculation. - -|aom_mse16x16_media| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loopmse - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loopmse - - ; return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c index eb6059705..f732224fd 100644 --- a/third_party/aom/aom_dsp/avg.c +++ b/third_party/aom/aom_dsp/avg.c @@ -13,26 +13,6 @@ #include "./aom_dsp_rtcd.h" #include "aom_ports/mem.h" -unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) { - int i, j; - int sum = 0; - for (i = 0; i < 8; ++i, src += stride) - for (j = 0; j < 8; sum += src[j], ++j) { - } - - return ROUND_POWER_OF_TWO(sum, 6); -} - -unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) { - int i, j; - int sum = 0; - for (i = 0; i < 4; ++i, src += stride) - for (j = 0; j < 4; sum += src[j], ++j) { - } - - return ROUND_POWER_OF_TWO(sum, 4); -} - // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] static void hadamard_col8(const int16_t *src_diff, int src_stride, @@ -192,28 +172,6 @@ void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref, } #if CONFIG_HIGHBITDEPTH -unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) { - int i, j; - int sum = 0; - const uint16_t *s = CONVERT_TO_SHORTPTR(src); - for (i = 0; i < 8; ++i, s += stride) - for (j = 0; j < 8; sum += s[j], ++j) { - } - - return ROUND_POWER_OF_TWO(sum, 6); -} - -unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) { - int i, j; - int sum = 0; - const uint16_t *s = CONVERT_TO_SHORTPTR(src); - for (i = 0; i < 4; ++i, s += stride) - for (j = 0; j < 4; sum += s[j], ++j) { - } - - return ROUND_POWER_OF_TWO(sum, 4); -} - void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max) { int i, j; diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c index 96c4cb436..bf304dada 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.c +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "aom_dsp/bitreader.h" +#include "aom_dsp/binary_codes_reader.h" #include "av1/common/common.h" @@ -33,26 +33,28 @@ static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { } } -int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits) { - if (aom_read_bit(r, NULL)) { - int s = aom_read_bit(r, NULL); - int16_t x = aom_read_literal(r, mag_bits, NULL) + 1; +int16_t aom_read_primitive_symmetric_(aom_reader *r, + unsigned int mag_bits ACCT_STR_PARAM) { + if (aom_read_bit(r, ACCT_STR_NAME)) { + int s = aom_read_bit(r, ACCT_STR_NAME); + int16_t x = aom_read_literal(r, mag_bits, ACCT_STR_NAME) + 1; return (s > 0 ? -x : x); } else { return 0; } } -uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n) { +uint16_t aom_read_primitive_quniform_(aom_reader *r, + uint16_t n ACCT_STR_PARAM) { if (n <= 1) return 0; const int l = get_msb(n - 1) + 1; const int m = (1 << l) - n; - const int v = aom_read_literal(r, l - 1, NULL); - return v < m ? v : (v << 1) - m + aom_read_bit(r, NULL); + const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); + return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); } -uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p, - uint16_t ref) { +uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p, + uint16_t ref ACCT_STR_PARAM) { if (n <= 1) return 0; assert(p > 0 && p <= n); assert(ref < n); @@ -64,10 +66,10 @@ uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p, lolimit = n - p; } int v; - if (aom_read_bit(r, NULL)) { - v = aom_read_primitive_quniform(r, p) + lolimit; + if (aom_read_bit(r, ACCT_STR_NAME)) { + v = aom_read_primitive_quniform(r, p, ACCT_STR_NAME) + lolimit; } else { - v = aom_read_primitive_quniform(r, n - p); + v = aom_read_primitive_quniform(r, n - p, ACCT_STR_NAME); if (v >= lolimit) v += p; } return v; @@ -75,7 +77,8 @@ uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p, // Decode finite subexponential code that for a symbol v in [0, n-1] with // parameter k -uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k) { +uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, + uint16_t k ACCT_STR_PARAM) { int i = 0; int mk = 0; uint16_t v; @@ -83,14 +86,14 @@ uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k) { int b = (i ? k + i - 1 : k); int a = (1 << b); if (n <= mk + 3 * a) { - v = aom_read_primitive_quniform(r, n - mk) + mk; + v = aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; break; } else { - if (aom_read_bit(r, NULL)) { + if (aom_read_bit(r, ACCT_STR_NAME)) { i = i + 1; mk += a; } else { - v = aom_read_literal(r, b, NULL) + mk; + v = aom_read_literal(r, b, ACCT_STR_NAME) + mk; break; } } @@ -101,17 +104,19 @@ uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k) { // Decode finite subexponential code that for a symbol v in [0, n-1] with // parameter k // based on a reference ref also in [0, n-1]. -uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k, - uint16_t ref) { - return inv_recenter_finite_nonneg(n, ref, - aom_read_primitive_subexpfin(r, n, k)); +uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref ACCT_STR_PARAM) { + return inv_recenter_finite_nonneg( + n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); } // Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with // parameter k based on a reference ref also in [-(n-1), n-1]. -int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n, - uint16_t k, int16_t ref) { +int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, + uint16_t k, + int16_t ref ACCT_STR_PARAM) { ref += n - 1; const uint16_t scaled_n = (n << 1) - 1; - return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref) - n + 1; + return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref, ACCT_STR_NAME) - + n + 1; } diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h index 738d91da8..1540cf46b 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.h +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -21,16 +21,32 @@ extern "C" { #include "aom/aom_integer.h" #include "aom_dsp/bitreader.h" -int16_t aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits); +#define aom_read_primitive_symmetric(r, n, ACCT_STR_NAME) \ + aom_read_primitive_symmetric_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \ + aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_primitive_refbilevel(r, n, p, ref, ACCT_STR_NAME) \ + aom_read_primitive_refbilevel_(r, n, p, ref ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \ + aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ + aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME)) +#define aom_read_signed_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ + aom_read_signed_primitive_refsubexpfin_(r, n, k, \ + ref ACCT_STR_ARG(ACCT_STR_NAME)) -uint16_t aom_read_primitive_quniform(aom_reader *r, uint16_t n); -uint16_t aom_read_primitive_refbilevel(aom_reader *r, uint16_t n, uint16_t p, - uint16_t ref); -uint16_t aom_read_primitive_subexpfin(aom_reader *r, uint16_t n, uint16_t k); -uint16_t aom_read_primitive_refsubexpfin(aom_reader *r, uint16_t n, uint16_t k, - uint16_t ref); -int16_t aom_read_signed_primitive_refsubexpfin(aom_reader *r, uint16_t n, - uint16_t k, int16_t ref); +int16_t aom_read_primitive_symmetric_(aom_reader *r, + unsigned int mag_bits ACCT_STR_PARAM); +uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM); +uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p, + uint16_t ref ACCT_STR_PARAM); +uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, + uint16_t k ACCT_STR_PARAM); +uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, + uint16_t ref ACCT_STR_PARAM); +int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, + uint16_t k, + int16_t ref ACCT_STR_PARAM); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h index 9cd34dd48..5bad70cb3 100644 --- a/third_party/aom/aom_dsp/bitreader.h +++ b/third_party/aom/aom_dsp/bitreader.h @@ -16,18 +16,13 @@ #include #include "./aom_config.h" -#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL -#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL." -#endif #include "aom/aomdx.h" #include "aom/aom_integer.h" #if CONFIG_ANS #include "aom_dsp/ansreader.h" -#elif CONFIG_DAALA_EC -#include "aom_dsp/daalaboolreader.h" #else -#include "aom_dsp/dkboolreader.h" +#include "aom_dsp/daalaboolreader.h" #endif #include "aom_dsp/prob.h" #include "av1/common/odintrin.h" @@ -61,26 +56,20 @@ extern "C" { #if CONFIG_ANS typedef struct AnsDecoder aom_reader; -#elif CONFIG_DAALA_EC -typedef struct daala_reader aom_reader; #else -typedef struct aom_dk_reader aom_reader; +typedef struct daala_reader aom_reader; #endif static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size, aom_decrypt_cb decrypt_cb, void *decrypt_state) { -#if CONFIG_ANS (void)decrypt_cb; (void)decrypt_state; +#if CONFIG_ANS if (size > INT_MAX) return 1; return ans_read_init(r, buffer, (int)size); -#elif CONFIG_DAALA_EC - (void)decrypt_cb; - (void)decrypt_state; - return aom_daala_reader_init(r, buffer, (int)size); #else - return aom_dk_reader_init(r, buffer, size, decrypt_cb, decrypt_state); + return aom_daala_reader_init(r, buffer, (int)size); #endif } @@ -89,20 +78,16 @@ static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) { (void)r; assert(0 && "Use the raw buffer size with ANS"); return NULL; -#elif CONFIG_DAALA_EC - return aom_daala_reader_find_end(r); #else - return aom_dk_reader_find_end(r); + return aom_daala_reader_find_end(r); #endif } static INLINE int aom_reader_has_error(aom_reader *r) { #if CONFIG_ANS return ans_reader_has_error(r); -#elif CONFIG_DAALA_EC - return aom_daala_reader_has_error(r); #else - return aom_dk_reader_has_error(r); + return aom_daala_reader_has_error(r); #endif } @@ -112,10 +97,8 @@ static INLINE uint32_t aom_reader_tell(const aom_reader *r) { (void)r; assert(0 && "aom_reader_tell() is unimplemented for ANS"); return 0; -#elif CONFIG_DAALA_EC - return aom_daala_reader_tell(r); #else - return aom_dk_reader_tell(r); + return aom_daala_reader_tell(r); #endif } @@ -125,10 +108,8 @@ static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) { (void)r; assert(0 && "aom_reader_tell_frac() is unimplemented for ANS"); return 0; -#elif CONFIG_DAALA_EC - return aom_daala_reader_tell_frac(r); #else - return aom_dk_reader_tell_frac(r); + return aom_daala_reader_tell_frac(r); #endif } @@ -155,10 +136,8 @@ static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { int ret; #if CONFIG_ANS ret = rabs_read(r, prob); -#elif CONFIG_DAALA_EC - ret = aom_daala_read(r, prob); #else - ret = aom_dk_read(r, prob); + ret = aom_daala_read(r, prob); #endif #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); @@ -171,7 +150,7 @@ static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { int ret; #if CONFIG_ANS ret = rabs_read_bit(r); // Non trivial optimization at half probability -#elif CONFIG_DAALA_EC && CONFIG_RAWBITS +#elif CONFIG_RAWBITS // Note this uses raw bits and is not the same as aom_daala_read(r, 128); // Calls to this function are omitted from raw symbol accounting. ret = aom_daala_read_bit(r); @@ -194,28 +173,14 @@ static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { return literal; } -static INLINE int aom_read_tree_as_bits(aom_reader *r, - const aom_tree_index *tree, - const aom_prob *probs) { - aom_tree_index i = 0; - - while ((i = tree[i + aom_read(r, probs[i >> 1], NULL)]) > 0) continue; - return -i; -} - -#if CONFIG_EC_MULTISYMBOL static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int ret; #if CONFIG_ANS (void)nsymbs; ret = rans_read(r, cdf); -#elif CONFIG_DAALA_EC - ret = daala_read_symbol(r, cdf, nsymbs); #else -#error \ - "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \ - "coder. Enable daala_ec or ans for a valid configuration." + ret = daala_read_symbol(r, cdf, nsymbs); #endif #if CONFIG_ACCOUNTING @@ -253,16 +218,11 @@ static INLINE int aom_read_tree_as_cdf(aom_reader *r, } while (i > 0); return -i; } -#endif // CONFIG_EC_MULTISYMBOL static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree, const aom_prob *probs ACCT_STR_PARAM) { int ret; -#if CONFIG_EC_MULTISYMBOL ret = aom_read_tree_as_cdf(r, tree, probs); -#else - ret = aom_read_tree_as_bits(r, tree, probs); -#endif #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); #endif diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c index 009682b4c..e51b1cc3a 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.c +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -24,7 +24,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb) { rb->bit_offset = off + 1; return bit; } else { - rb->error_handler(rb->error_handler_data); + if (rb->error_handler) rb->error_handler(rb->error_handler_data); return 0; } } diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h index 6e3fac260..588e47bf3 100644 --- a/third_party/aom/aom_dsp/bitwriter.h +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -14,16 +14,11 @@ #include #include "./aom_config.h" -#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL -#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL" -#endif #if CONFIG_ANS #include "aom_dsp/buf_ans.h" -#elif CONFIG_DAALA_EC -#include "aom_dsp/daalaboolwriter.h" #else -#include "aom_dsp/dkboolwriter.h" +#include "aom_dsp/daalaboolwriter.h" #endif #include "aom_dsp/prob.h" @@ -38,10 +33,8 @@ extern "C" { #if CONFIG_ANS typedef struct BufAnsCoder aom_writer; -#elif CONFIG_DAALA_EC -typedef struct daala_writer aom_writer; #else -typedef struct aom_dk_writer aom_writer; +typedef struct daala_writer aom_writer; #endif typedef struct TOKEN_STATS { @@ -72,10 +65,8 @@ static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { (void)bc; (void)buffer; assert(0 && "buf_ans requires a more complicated startup procedure"); -#elif CONFIG_DAALA_EC - aom_daala_start_encode(bc, buffer); #else - aom_dk_start_encode(bc, buffer); + aom_daala_start_encode(bc, buffer); #endif } @@ -83,20 +74,16 @@ static INLINE void aom_stop_encode(aom_writer *bc) { #if CONFIG_ANS (void)bc; assert(0 && "buf_ans requires a more complicated shutdown procedure"); -#elif CONFIG_DAALA_EC - aom_daala_stop_encode(bc); #else - aom_dk_stop_encode(bc); + aom_daala_stop_encode(bc); #endif } static INLINE void aom_write(aom_writer *br, int bit, int probability) { #if CONFIG_ANS buf_rabs_write(br, bit, probability); -#elif CONFIG_DAALA_EC - aom_daala_write(br, bit, probability); #else - aom_dk_write(br, bit, probability); + aom_daala_write(br, bit, probability); #endif } @@ -113,7 +100,7 @@ static INLINE void aom_write_record(aom_writer *br, int bit, int probability, static INLINE void aom_write_bit(aom_writer *w, int bit) { #if CONFIG_ANS buf_rabs_write_bit(w, bit); -#elif CONFIG_DAALA_EC && CONFIG_RAWBITS +#elif CONFIG_RAWBITS // Note this uses raw bits and is not the same as aom_daala_write(r, 128); aom_daala_write_bit(w, bit); #else @@ -137,28 +124,6 @@ static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit)); } -static INLINE void aom_write_tree_as_bits(aom_writer *w, - const aom_tree_index *tr, - const aom_prob *probs, int bits, - int len, aom_tree_index i) { - do { - const int bit = (bits >> --len) & 1; - aom_write(w, bit, probs[i >> 1]); - i = tr[i + bit]; - } while (len); -} - -static INLINE void aom_write_tree_as_bits_record( - aom_writer *w, const aom_tree_index *tr, const aom_prob *probs, int bits, - int len, aom_tree_index i, TOKEN_STATS *token_stats) { - do { - const int bit = (bits >> --len) & 1; - aom_write_record(w, bit, probs[i >> 1], token_stats); - i = tr[i + bit]; - } while (len); -} - -#if CONFIG_EC_MULTISYMBOL static INLINE void aom_write_cdf(aom_writer *w, int symb, const aom_cdf_prob *cdf, int nsymbs) { #if CONFIG_ANS @@ -167,12 +132,8 @@ static INLINE void aom_write_cdf(aom_writer *w, int symb, const aom_cdf_prob cum_prob = symb > 0 ? cdf[symb - 1] : 0; const aom_cdf_prob prob = cdf[symb] - cum_prob; buf_rans_write(w, cum_prob, prob); -#elif CONFIG_DAALA_EC - daala_write_symbol(w, symb, cdf, nsymbs); #else -#error \ - "CONFIG_EC_MULTISYMBOL is selected without a valid backing entropy " \ - "coder. Enable daala_ec or ans for a valid configuration." + daala_write_symbol(w, symb, cdf, nsymbs); #endif } @@ -223,16 +184,10 @@ static INLINE void aom_write_tree_as_cdf(aom_writer *w, } while (len); } -#endif // CONFIG_EC_MULTISYMBOL - static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree, const aom_prob *probs, int bits, int len, aom_tree_index i) { -#if CONFIG_EC_MULTISYMBOL aom_write_tree_as_cdf(w, tree, probs, bits, len, i); -#else - aom_write_tree_as_bits(w, tree, probs, bits, len, i); -#endif } static INLINE void aom_write_tree_record(aom_writer *w, @@ -240,12 +195,8 @@ static INLINE void aom_write_tree_record(aom_writer *w, const aom_prob *probs, int bits, int len, aom_tree_index i, TOKEN_STATS *token_stats) { -#if CONFIG_EC_MULTISYMBOL (void)token_stats; aom_write_tree_as_cdf(w, tree, probs, bits, len, i); -#else - aom_write_tree_as_bits_record(w, tree, probs, bits, len, i, token_stats); -#endif } #ifdef __cplusplus diff --git a/third_party/aom/aom_dsp/dkboolreader.c b/third_party/aom/aom_dsp/dkboolreader.c deleted file mode 100644 index 288d5f1ce..000000000 --- a/third_party/aom/aom_dsp/dkboolreader.c +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_config.h" - -#include "aom_dsp/dkboolreader.h" -#include "aom_dsp/prob.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" -#include "aom_mem/aom_mem.h" -#include "aom_util/endian_inl.h" - -static INLINE int aom_dk_read_bit(struct aom_dk_reader *r) { - return aom_dk_read(r, 128); // aom_prob_half -} - -int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer, - size_t size, aom_decrypt_cb decrypt_cb, - void *decrypt_state) { - if (size && !buffer) { - return 1; - } else { - r->buffer_end = buffer + size; - r->buffer_start = r->buffer = buffer; - r->value = 0; - r->count = -8; - r->range = 255; - r->decrypt_cb = decrypt_cb; - r->decrypt_state = decrypt_state; - aom_dk_reader_fill(r); -#if CONFIG_ACCOUNTING - r->accounting = NULL; -#endif - return aom_dk_read_bit(r) != 0; // marker bit - } -} - -void aom_dk_reader_fill(struct aom_dk_reader *r) { - const uint8_t *const buffer_end = r->buffer_end; - const uint8_t *buffer = r->buffer; - const uint8_t *buffer_start = buffer; - BD_VALUE value = r->value; - int count = r->count; - const size_t bytes_left = buffer_end - buffer; - const size_t bits_left = bytes_left * CHAR_BIT; - int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT); - - if (r->decrypt_cb) { - size_t n = AOMMIN(sizeof(r->clear_buffer), bytes_left); - r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n); - buffer = r->clear_buffer; - buffer_start = r->clear_buffer; - } - if (bits_left > BD_VALUE_SIZE) { - const int bits = (shift & 0xfffffff8) + CHAR_BIT; - BD_VALUE nv; - BD_VALUE big_endian_values; - memcpy(&big_endian_values, buffer, sizeof(BD_VALUE)); -#if SIZE_MAX == 0xffffffffffffffffULL - big_endian_values = HToBE64(big_endian_values); -#else - big_endian_values = HToBE32(big_endian_values); -#endif - nv = big_endian_values >> (BD_VALUE_SIZE - bits); - count += bits; - buffer += (bits >> 3); - value = r->value | (nv << (shift & 0x7)); - } else { - const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left); - int loop_end = 0; - if (bits_over >= 0) { - count += LOTS_OF_BITS; - loop_end = bits_over; - } - - if (bits_over < 0 || bits_left) { - while (shift >= loop_end) { - count += CHAR_BIT; - value |= (BD_VALUE)*buffer++ << shift; - shift -= CHAR_BIT; - } - } - } - - // NOTE: Variable 'buffer' may not relate to 'r->buffer' after decryption, - // so we increase 'r->buffer' by the amount that 'buffer' moved, rather than - // assign 'buffer' to 'r->buffer'. - r->buffer += buffer - buffer_start; - r->value = value; - r->count = count; -} - -const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r) { - // Find the end of the coded buffer - while (r->count > CHAR_BIT && r->count < BD_VALUE_SIZE) { - r->count -= CHAR_BIT; - r->buffer--; - } - return r->buffer; -} diff --git a/third_party/aom/aom_dsp/dkboolreader.h b/third_party/aom/aom_dsp/dkboolreader.h deleted file mode 100644 index f0bc84381..000000000 --- a/third_party/aom/aom_dsp/dkboolreader.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_DKBOOLREADER_H_ -#define AOM_DSP_DKBOOLREADER_H_ - -#include -#include -#include - -#include "./aom_config.h" -#if CONFIG_BITSTREAM_DEBUG -#include -#include -#include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG - -#include "aom_ports/mem.h" -#include "aom/aomdx.h" -#include "aom/aom_integer.h" -#include "aom_dsp/prob.h" -#if CONFIG_ACCOUNTING -#include "av1/decoder/accounting.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -typedef size_t BD_VALUE; - -#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT) - -// This is meant to be a large, positive constant that can still be efficiently -// loaded as an immediate (on platforms like ARM, for example). -// Even relatively modest values like 100 would work fine. -#define LOTS_OF_BITS 0x40000000 - -struct aom_dk_reader { - // Be careful when reordering this struct, it may impact the cache negatively. - BD_VALUE value; - unsigned int range; - int count; - const uint8_t *buffer_start; - const uint8_t *buffer_end; - const uint8_t *buffer; - aom_decrypt_cb decrypt_cb; - void *decrypt_state; - uint8_t clear_buffer[sizeof(BD_VALUE) + 1]; -#if CONFIG_ACCOUNTING - Accounting *accounting; -#endif -}; - -int aom_dk_reader_init(struct aom_dk_reader *r, const uint8_t *buffer, - size_t size, aom_decrypt_cb decrypt_cb, - void *decrypt_state); - -void aom_dk_reader_fill(struct aom_dk_reader *r); - -const uint8_t *aom_dk_reader_find_end(struct aom_dk_reader *r); - -static INLINE uint32_t aom_dk_reader_tell(const struct aom_dk_reader *r) { - const uint32_t bits_read = - (uint32_t)((r->buffer - r->buffer_start) * CHAR_BIT); - const int count = - (r->count < LOTS_OF_BITS) ? r->count : r->count - LOTS_OF_BITS; - assert(r->buffer >= r->buffer_start); - return bits_read - (count + CHAR_BIT); -} - -/*The resolution of fractional-precision bit usage measurements, i.e., - 3 => 1/8th bits.*/ -#define DK_BITRES (3) - -static INLINE uint32_t aom_dk_reader_tell_frac(const struct aom_dk_reader *r) { - uint32_t num_bits; - uint32_t range; - int l; - int i; - num_bits = aom_dk_reader_tell(r) << DK_BITRES; - range = r->range; - l = 0; - for (i = DK_BITRES; i-- > 0;) { - int b; - range = range * range >> 7; - b = (int)(range >> 8); - l = l << 1 | b; - range >>= b; - } - return num_bits - l; -} - -static INLINE int aom_dk_reader_has_error(struct aom_dk_reader *r) { - // Check if we have reached the end of the buffer. - // - // Variable 'count' stores the number of bits in the 'value' buffer, minus - // 8. The top byte is part of the algorithm, and the remainder is buffered - // to be shifted into it. So if count == 8, the top 16 bits of 'value' are - // occupied, 8 for the algorithm and 8 in the buffer. - // - // When reading a byte from the user's buffer, count is filled with 8 and - // one byte is filled into the value buffer. When we reach the end of the - // data, count is additionally filled with LOTS_OF_BITS. So when - // count == LOTS_OF_BITS - 1, the user's data has been exhausted. - // - // 1 if we have tried to decode bits after the end of stream was encountered. - // 0 No error. - return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS; -} - -static INLINE int aom_dk_read(struct aom_dk_reader *r, int prob) { - unsigned int bit = 0; - BD_VALUE value; - BD_VALUE bigsplit; - int count; - unsigned int range; - unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT; - - if (r->count < 0) aom_dk_reader_fill(r); - - value = r->value; - count = r->count; - - bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); - - range = split; - - if (value >= bigsplit) { - range = r->range - split; - value = value - bigsplit; - bit = 1; - } - - { - register int shift = aom_norm[range]; - range <<= shift; - value <<= shift; - count -= shift; - } - r->value = value; - r->count = count; - r->range = range; - -#if CONFIG_BITSTREAM_DEBUG - { - int ref_bit, ref_prob; - const int queue_r = bitstream_queue_get_read(); - const int frame_idx = bitstream_queue_get_frame_read(); - bitstream_queue_pop(&ref_bit, &ref_prob); - if (prob != ref_prob) { - fprintf( - stderr, - "\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n", - frame_idx, prob, ref_prob, queue_r); - assert(0); - } - if ((int)bit != ref_bit) { - fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n", - frame_idx, bit, ref_bit); - assert(0); - } - } -#endif // CONFIG_BITSTREAM_DEBUG - - return bit; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_DSP_DKBOOLREADER_H_ diff --git a/third_party/aom/aom_dsp/dkboolwriter.c b/third_party/aom/aom_dsp/dkboolwriter.c deleted file mode 100644 index fc98e7c9b..000000000 --- a/third_party/aom/aom_dsp/dkboolwriter.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./dkboolwriter.h" - -static INLINE void aom_dk_write_bit(aom_dk_writer *w, int bit) { - aom_dk_write(w, bit, 128); // aom_prob_half -} - -void aom_dk_start_encode(aom_dk_writer *br, uint8_t *source) { - br->lowvalue = 0; - br->range = 255; - br->count = -24; - br->buffer = source; - br->pos = 0; - aom_dk_write_bit(br, 0); -} - -void aom_dk_stop_encode(aom_dk_writer *br) { - int i; - -#if CONFIG_BITSTREAM_DEBUG - bitstream_queue_set_skip_write(1); -#endif // CONFIG_BITSTREAM_DEBUG - - for (i = 0; i < 32; i++) aom_dk_write_bit(br, 0); - -#if CONFIG_BITSTREAM_DEBUG - bitstream_queue_set_skip_write(0); -#endif // CONFIG_BITSTREAM_DEBUG - - // Ensure there's no ambigous collision with any index marker bytes - if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; -} diff --git a/third_party/aom/aom_dsp/dkboolwriter.h b/third_party/aom/aom_dsp/dkboolwriter.h deleted file mode 100644 index 835436885..000000000 --- a/third_party/aom/aom_dsp/dkboolwriter.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_DKBOOLWRITER_H_ -#define AOM_DSP_DKBOOLWRITER_H_ - -#include "./aom_config.h" - -#if CONFIG_BITSTREAM_DEBUG -#include -#include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG - -#include "aom_dsp/prob.h" -#include "aom_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct aom_dk_writer { - unsigned int lowvalue; - unsigned int range; - int count; - unsigned int pos; - uint8_t *buffer; -} aom_dk_writer; - -void aom_dk_start_encode(aom_dk_writer *bc, uint8_t *buffer); -void aom_dk_stop_encode(aom_dk_writer *bc); - -static INLINE void aom_dk_write(aom_dk_writer *br, int bit, int probability) { - unsigned int split; - int count = br->count; - unsigned int range = br->range; - unsigned int lowvalue = br->lowvalue; - register int shift; - -#if CONFIG_BITSTREAM_DEBUG - // int queue_r = 0; - // int frame_idx_r = 0; - // int queue_w = bitstream_queue_get_write(); - // int frame_idx_w = bitstream_queue_get_frame_write(); - // if (frame_idx_w == frame_idx_r && queue_w == queue_r) { - // fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", - // frame_idx_w, queue_w); - // } - bitstream_queue_push(bit, probability); -#endif // CONFIG_BITSTREAM_DEBUG - - split = 1 + (((range - 1) * probability) >> 8); - - range = split; - - if (bit) { - lowvalue += split; - range = br->range - split; - } - - shift = aom_norm[range]; - - range <<= shift; - count += shift; - - if (count >= 0) { - int offset = shift - count; - - if ((lowvalue << (offset - 1)) & 0x80000000) { - int x = br->pos - 1; - - while (x >= 0 && br->buffer[x] == 0xff) { - br->buffer[x] = 0; - x--; - } - - br->buffer[x] += 1; - } - - br->buffer[br->pos++] = (lowvalue >> (24 - offset)); - lowvalue <<= offset; - shift = count; - lowvalue &= 0xffffff; - count -= 8; - } - - lowvalue <<= shift; - br->count = count; - br->lowvalue = lowvalue; - br->range = range; -} - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_DSP_DKBOOLWRITER_H_ diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c index 1f0870b64..370d0374b 100644 --- a/third_party/aom/aom_dsp/intrapred.c +++ b/third_party/aom/aom_dsp/intrapred.c @@ -208,33 +208,30 @@ static const int sm_weight_log2_scale = 8; #if CONFIG_TX64X64 // max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) #define MAX_BLOCK_DIM 64 -#define NUM_BLOCK_DIMS 6 // log2(MAX_BLOCK_DIM) #else #define MAX_BLOCK_DIM 32 -#define NUM_BLOCK_DIMS 5 #endif // CONFIG_TX64X64 -static const uint8_t sm_weight_arrays[NUM_BLOCK_DIMS][MAX_BLOCK_DIM] = { +static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { + // Unused, because we always offset by bs, which is at least 2. + 0, 0, // bs = 2 - { 255, 128 }, + 255, 128, // bs = 4 - { 255, 149, 85, 64 }, + 255, 149, 85, 64, // bs = 8 - { 255, 197, 146, 105, 73, 50, 37, 32 }, + 255, 197, 146, 105, 73, 50, 37, 32, // bs = 16 - { 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16 }, + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, // bs = 32 - { - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, - 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34, - 29, 25, 21, 17, 14, 12, 10, 9, 8, 8 }, + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, #if CONFIG_TX64X64 // bs = 64 - { 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, - 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, - 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44, - 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13, - 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4 }, + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, + 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, #endif // CONFIG_TX64X64 }; @@ -250,10 +247,7 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel - const int arr_index = get_msb(bs) - 1; - assert(arr_index >= 0); - assert(arr_index < NUM_BLOCK_DIMS); - const uint8_t *const sm_weights = sm_weight_arrays[arr_index]; + const uint8_t *const sm_weights = sm_weight_arrays + bs; // scale = 2 * 2^sm_weight_log2_scale const int log2_scale = 1 + sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); @@ -277,6 +271,64 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } +#if CONFIG_SMOOTH_HV +static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bs; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bs; r++) { + int c; + for (c = 0; c < bs; ++c) { + const uint8_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); + } + dst += stride; + } +} + +static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bs; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bs; r++) { + int c; + for (c = 0; c < bs; ++c) { + const uint8_t pixels[] = { left[r], right_pred }; + const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); + } + dst += stride; + } +} +#endif // CONFIG_SMOOTH_HV + #else static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, @@ -743,10 +795,7 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel - const int arr_index = get_msb(bs) - 1; - assert(arr_index >= 0); - assert(arr_index < NUM_BLOCK_DIMS); - const uint8_t *const sm_weights = sm_weight_arrays[arr_index]; + const uint8_t *const sm_weights = sm_weight_arrays + bs; // scale = 2 * 2^sm_weight_log2_scale const int log2_scale = 1 + sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); @@ -770,6 +819,64 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, } } +#if CONFIG_SMOOTH_HV +static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bs; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bs; r++) { + int c; + for (c = 0; c < bs; ++c) { + const uint16_t pixels[] = { above[c], below_pred }; + const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[r]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); + } + dst += stride; + } +} + +static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bs; + // scale = 2^sm_weight_log2_scale + const int log2_scale = sm_weight_log2_scale; + const uint16_t scale = (1 << sm_weight_log2_scale); + sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + + int r; + for (r = 0; r < bs; r++) { + int c; + for (c = 0; c < bs; ++c) { + const uint16_t pixels[] = { left[r], right_pred }; + const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; + uint32_t this_pred = 0; + assert(scale >= sm_weights[c]); + int i; + for (i = 0; i < 2; ++i) { + this_pred += weights[i] * pixels[i]; + } + dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); + } + dst += stride; + } +} +#endif + #else static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, @@ -879,6 +986,7 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, intra_pred_sized(type, 16) \ intra_pred_sized(type, 32) \ intra_pred_sized(type, 64) \ + intra_pred_highbd_sized(type, 2) \ intra_pred_highbd_sized(type, 4) \ intra_pred_highbd_sized(type, 8) \ intra_pred_highbd_sized(type, 16) \ @@ -958,8 +1066,12 @@ intra_pred_above_4x4(d153) intra_pred_allsizes(v) intra_pred_allsizes(h) #if CONFIG_ALT_INTRA -intra_pred_allsizes(paeth) intra_pred_allsizes(smooth) +#if CONFIG_SMOOTH_HV +intra_pred_allsizes(smooth_v) +intra_pred_allsizes(smooth_h) +#endif // CONFIG_SMOOTH_HV +intra_pred_allsizes(paeth) #else intra_pred_allsizes(tm) #endif // CONFIG_ALT_INTRA diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c index bb995856a..6e7d8c928 100644 --- a/third_party/aom/aom_dsp/inv_txfm.c +++ b/third_party/aom/aom_dsp/inv_txfm.c @@ -1442,4 +1442,868 @@ void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, } } +void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[8], step2[8]; + tran_high_t temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 & stage 3 - even half + aom_highbd_idct4_c(step1, step1, bd); + + // stage 2 - odd half + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + // stage 3 - odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + // stage 4 + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); +} + +void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + (void)bd; + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); +} + +void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + + tran_low_t x0 = input[7]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[5]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[3]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[1]; + tran_low_t x7 = input[6]; + (void)bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); +} + +void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[16], step2[16]; + tran_high_t temp1, temp2; + (void)bd; + + // stage 1 + step1[0] = input[0 / 2]; + step1[1] = input[16 / 2]; + step1[2] = input[8 / 2]; + step1[3] = input[24 / 2]; + step1[4] = input[4 / 2]; + step1[5] = input[20 / 2]; + step1[6] = input[12 / 2]; + step1[7] = input[28 / 2]; + step1[8] = input[2 / 2]; + step1[9] = input[18 / 2]; + step1[10] = input[10 / 2]; + step1[11] = input[26 / 2]; + step1[12] = input[6 / 2]; + step1[13] = input[22 / 2]; + step1[14] = input[14 / 2]; + step1[15] = input[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); + + // stage 6 + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); +} + +void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + + tran_low_t x0 = input[15]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[13]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[11]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[9]; + tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; + (void)bd; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); + + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); +} + +void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_low_t step1[32], step2[32]; + tran_high_t temp1, temp2; + (void)bd; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); + step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); + step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); + step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); + step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); + step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); + step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); + step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); + step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); + step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); + step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); + step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); + step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); + step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); + step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); + step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); + step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); + step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); + step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); + step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); + step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); + step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); + step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); + step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); + step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); + step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); + step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); + + // stage 5 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); + step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); + step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[7] = step2[7]; + + step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); + step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); + step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); + step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); + step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); + step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); + step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); + step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); + step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); + step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); + step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); + step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); + step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); + step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); + step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); + step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); + step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); + step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); + step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); + + step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); + step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); + step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); + step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); + step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); + step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); + step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); + step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); + + // stage 7 + step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); + step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); + step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); + step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); + step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); + step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); + step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); + step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); + step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); + step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); + step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); + step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); + step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); + step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); + step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); + step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = (step2[21] + step2[26]) * cospi_16_64; + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd); + output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); + output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); + output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); + output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); + output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); + output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); + output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); + output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); + output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); + output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); + output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); + output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); + output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); + output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); + output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); + output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); + output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); + output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); + output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); + output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); + output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); + output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); + output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); + output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); + output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); + output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); + output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); + output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); + output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); + output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); + output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); +} + #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c index e2e839219..7ea1e6b89 100644 --- a/third_party/aom/aom_dsp/loopfilter.c +++ b/third_party/aom/aom_dsp/loopfilter.c @@ -149,10 +149,15 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -179,10 +184,15 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; @@ -206,7 +216,7 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } -static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3) { @@ -229,10 +239,15 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -256,8 +271,13 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = @@ -278,8 +298,8 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, } #if PARALLEL_DEBLOCKING_11_TAP -static INLINE void filter12(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op5, uint8_t *op4, +static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, uint8_t *oq4, @@ -308,8 +328,8 @@ static INLINE void filter12(int8_t mask, uint8_t thresh, uint8_t flat, #endif #if PARALLEL_DEBLOCKING_9_TAP -static INLINE void filter10(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op4, uint8_t *op3, +static INLINE void filter10(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, uint8_t *oq4) { @@ -332,8 +352,8 @@ static INLINE void filter10(int8_t mask, uint8_t thresh, uint8_t flat, } #endif -static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint8_t *op7, uint8_t *op6, +static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, @@ -390,10 +410,15 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int step = 4; +#else + int step = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < step * count; ++i) { const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; @@ -436,7 +461,11 @@ void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +#else mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +#endif } static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, @@ -478,7 +507,11 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); +#else mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +#endif } void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, @@ -596,10 +629,15 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint16_t p3 = s[-4 * p]; const uint16_t p2 = s[-3 * p]; @@ -636,10 +674,15 @@ void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; @@ -665,7 +708,7 @@ void aom_highbd_lpf_vertical_4_dual_c( bd); } -static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, +static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, int bd) { @@ -689,10 +732,15 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -718,8 +766,13 @@ void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = @@ -741,8 +794,8 @@ void aom_highbd_lpf_vertical_8_dual_c( bd); } -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, - uint8_t flat2, uint16_t *op7, uint16_t *op6, +static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, @@ -813,10 +866,15 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *thresh, int count, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int step = 4; +#else + int step = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < step * count; ++i) { const uint16_t p3 = s[-4 * p]; const uint16_t p2 = s[-3 * p]; const uint16_t p1 = s[-2 * p]; @@ -852,7 +910,11 @@ void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +#else highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); +#endif } static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, @@ -888,13 +950,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); +#else highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); +#endif } void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); +#else highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); +#endif } #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/mips/avg_msa.c b/third_party/aom/aom_dsp/mips/avg_msa.c deleted file mode 100644 index 0e1728155..000000000 --- a/third_party/aom/aom_dsp/mips/avg_msa.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/macros_msa.h" - -uint32_t aom_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { - uint32_t sum_out; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; - v4u32 sum = { 0 }; - - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); - HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); - ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); - ADD2(sum0, sum2, sum4, sum6, sum0, sum4); - sum0 += sum4; - - sum = __msa_hadd_u_w(sum0, sum0); - sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); - sum = __msa_hadd_u_w(sum0, sum0); - sum = (v4u32)__msa_srari_w((v4i32)sum, 6); - sum_out = __msa_copy_u_w((v4i32)sum, 0); - - return sum_out; -} - -uint32_t aom_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { - uint32_t sum_out; - uint32_t src0, src1, src2, src3; - v16u8 vec = { 0 }; - v8u16 sum0; - v4u32 sum1; - v2u64 sum2; - - LW4(src, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, vec); - - sum0 = __msa_hadd_u_h(vec, vec); - sum1 = __msa_hadd_u_w(sum0, sum0); - sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); - sum1 = __msa_hadd_u_w(sum0, sum0); - sum2 = __msa_hadd_u_d(sum1, sum1); - sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); - sum_out = __msa_copy_u_w((v4i32)sum1, 0); - - return sum_out; -} diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c index c60bfdac5..eefe7521f 100644 --- a/third_party/aom/aom_dsp/prob.c +++ b/third_party/aom/aom_dsp/prob.c @@ -11,25 +11,10 @@ #include "./aom_config.h" -#if CONFIG_EC_MULTISYMBOL #include -#endif #include "aom_dsp/prob.h" -const uint8_t aom_norm[256] = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - static unsigned int tree_merge_probs_impl(unsigned int i, const aom_tree_index *tree, const aom_prob *pre_probs, @@ -53,7 +38,6 @@ void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, tree_merge_probs_impl(0, tree, pre_probs, counts, probs); } -#if CONFIG_EC_MULTISYMBOL typedef struct tree_node tree_node; struct tree_node { @@ -233,4 +217,3 @@ void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree) { int stack_index = 0; tree_to_index(&stack_index, ind, inv, tree, 0, 0); } -#endif diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h index 808592923..ec6654ab7 100644 --- a/third_party/aom/aom_dsp/prob.h +++ b/third_party/aom/aom_dsp/prob.h @@ -20,7 +20,7 @@ #include "aom_ports/bitops.h" #include "aom_ports/mem.h" -#if CONFIG_DAALA_EC +#if !CONFIG_ANS #include "aom_dsp/entcode.h" #endif @@ -33,14 +33,12 @@ typedef uint8_t aom_prob; // TODO(negge): Rename this aom_prob once we remove vpxbool. typedef uint16_t aom_cdf_prob; -#if CONFIG_EC_MULTISYMBOL #define CDF_SIZE(x) ((x) + 1) -#endif #define CDF_PROB_BITS 15 #define CDF_PROB_TOP (1 << CDF_PROB_BITS) -#if CONFIG_DAALA_EC +#if !CONFIG_ANS #define AOM_ICDF OD_ICDF #else #define AOM_ICDF(x) (x) @@ -117,7 +115,6 @@ static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob, void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, const unsigned int *counts, aom_prob *probs); -#if CONFIG_EC_MULTISYMBOL int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind, int *pth, int *len); @@ -150,9 +147,6 @@ static INLINE void av1_tree_to_cdf(const aom_tree_index *tree, } while (0) void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree); -#endif - -DECLARE_ALIGNED(16, extern const uint8_t, aom_norm[256]); #if CONFIG_EC_ADAPT static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { @@ -165,7 +159,7 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { tmp = AOM_ICDF(tmp0); diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate; // Single loop (faster) -#if CONFIG_DAALA_EC && CONFIG_EC_SMALLMUL +#if !CONFIG_ANS && CONFIG_EC_SMALLMUL for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) { tmp -= (i == val ? diff : 0); cdf[i] += ((tmp - cdf[i]) >> rate); diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c index 3e1070519..2cc172ba5 100644 --- a/third_party/aom/aom_dsp/sad.c +++ b/third_party/aom/aom_dsp/sad.c @@ -16,6 +16,7 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" +#include "aom_dsp/blend.h" /* Sum the difference between every corresponding element of the buffers. */ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, @@ -311,15 +312,20 @@ highbd_sadMxNx4D(4, 4) #if CONFIG_AV1 && CONFIG_EXT_INTER static INLINE - unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b, + unsigned int masked_sad(const uint8_t *src, int src_stride, + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); + for (x = 0; x < width; x++) { + const uint8_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); + sad += abs(pred - src[x]); + } + src += src_stride; a += a_stride; b += b_stride; m += m_stride; @@ -329,12 +335,17 @@ highbd_sadMxNx4D(4, 4) return sad; } -#define MASKSADMxN(m, n) \ - unsigned int aom_masked_sad##m##x##n##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \ - n); \ +#define MASKSADMxN(m, n) \ + unsigned int aom_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n); \ + else \ + return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \ + msk_stride, m, n); \ } /* clang-format off */ @@ -360,18 +371,24 @@ MASKSADMxN(4, 4) #if CONFIG_HIGHBITDEPTH static INLINE - unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride, + unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride, int width, int height) { int y, x; unsigned int sad = 0; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); const uint16_t *a = CONVERT_TO_SHORTPTR(a8); const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]); + for (x = 0; x < width; x++) { + const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); + sad += abs(pred - src[x]); + } + src += src_stride; a += a_stride; b += b_stride; m += m_stride; @@ -381,12 +398,17 @@ MASKSADMxN(4, 4) return sad; } -#define HIGHBD_MASKSADMXN(m, n) \ - unsigned int aom_highbd_masked_sad##m##x##n##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \ - msk_stride, m, n); \ +#define HIGHBD_MASKSADMXN(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_c( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ } #if CONFIG_EXT_PARTITION diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h index ee2b683a4..5c0042d8c 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h @@ -60,7 +60,9 @@ SIMD_INLINE void v64_store_aligned(void *p, v64 a) { c_v64_store_aligned(p, a); } -SIMD_INLINE v64 v64_align(v64 a, v64 b, c) { return c_v64_align(a, b, c); } +SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { + return c_v64_align(a, b, c); +} SIMD_INLINE v64 v64_zero() { return c_v64_zero(); } SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); } diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index 9fc0db783..79677c92f 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -18,6 +18,7 @@ #include "aom_dsp/variance.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/blend.h" uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride) { @@ -672,297 +673,215 @@ void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, #endif // CONFIG_HIGHBITDEPTH #if CONFIG_AV1 && CONFIG_EXT_INTER -void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, const uint8_t *m, int m_stride, int w, int h, - unsigned int *sse, int *sum) { +void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { int i, j; - int64_t sum64 = 0; - uint64_t sse64 = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = (a[j] - b[j]) * (m[j]); - sum64 += diff; - sse64 += diff * diff; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + if (!invert_mask) + comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); + else + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); } - - a += a_stride; - b += b_stride; - m += m_stride; + comp_pred += width; + pred += width; + ref += ref_stride; + mask += mask_stride; } - sum64 = (sum64 >= 0) ? sum64 : -sum64; - *sum = (int)ROUND_POWER_OF_TWO(sum64, 6); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12); } -#define MASK_VAR(W, H) \ - unsigned int aom_masked_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - int sum; \ - masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \ - return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ +void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i, j; + int stride = ref_stride << 3; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + if (!invert_mask) + comp_pred[j] = AOM_BLEND_A64(mask[j], ref[(j << 3)], pred[j]); + else + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[(j << 3)]); + } + comp_pred += width; + pred += width; + ref += stride; + mask += mask_stride; } +} #define MASK_SUBPIX_VAR(W, H) \ unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ unsigned int *sse) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ bilinear_filters_2t[xoffset]); \ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ - return aom_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk, \ - msk_stride, sse); \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ } -MASK_VAR(4, 4) MASK_SUBPIX_VAR(4, 4) - -MASK_VAR(4, 8) MASK_SUBPIX_VAR(4, 8) - -MASK_VAR(8, 4) MASK_SUBPIX_VAR(8, 4) - -MASK_VAR(8, 8) MASK_SUBPIX_VAR(8, 8) - -MASK_VAR(8, 16) MASK_SUBPIX_VAR(8, 16) - -MASK_VAR(16, 8) MASK_SUBPIX_VAR(16, 8) - -MASK_VAR(16, 16) MASK_SUBPIX_VAR(16, 16) - -MASK_VAR(16, 32) MASK_SUBPIX_VAR(16, 32) - -MASK_VAR(32, 16) MASK_SUBPIX_VAR(32, 16) - -MASK_VAR(32, 32) MASK_SUBPIX_VAR(32, 32) - -MASK_VAR(32, 64) MASK_SUBPIX_VAR(32, 64) - -MASK_VAR(64, 32) MASK_SUBPIX_VAR(64, 32) - -MASK_VAR(64, 64) MASK_SUBPIX_VAR(64, 64) - #if CONFIG_EXT_PARTITION -MASK_VAR(64, 128) MASK_SUBPIX_VAR(64, 128) - -MASK_VAR(128, 64) MASK_SUBPIX_VAR(128, 64) - -MASK_VAR(128, 128) MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION #if CONFIG_HIGHBITDEPTH -void highbd_masked_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m, - int m_stride, int w, int h, uint64_t *sse, - int64_t *sum) { +void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { int i, j; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - const int diff = (a[j] - b[j]) * (m[j]); - *sum += (int64_t)diff; - *sse += (int64_t)diff * diff; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + if (!invert_mask) + comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); + else + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); } - - a += a_stride; - b += b_stride; - m += m_stride; + comp_pred += width; + pred += width; + ref += ref_stride; + mask += mask_stride; } - *sum = (*sum >= 0) ? *sum : -*sum; - *sum = ROUND_POWER_OF_TWO(*sum, 6); - *sse = ROUND_POWER_OF_TWO(*sse, 12); } -void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, const uint8_t *m, int m_stride, int w, - int h, unsigned int *sse, int *sum) { - int64_t sum64; - uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, - &sse64, &sum64); - *sum = (int)sum64; - *sse = (unsigned int)sse64; -} - -void highbd_10_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64; - uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, - &sse64, &sum64); - *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); -} - -void highbd_12_masked_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, int w, int h, - unsigned int *sse, int *sum) { - int64_t sum64; - uint64_t sse64; - highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h, - &sse64, &sum64); - *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); - *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); -} +void aom_highbd_comp_mask_upsampled_pred_c(uint16_t *comp_pred, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i, j; + int stride = ref_stride << 3; -#define HIGHBD_MASK_VAR(W, H) \ - unsigned int aom_highbd_masked_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - int sum; \ - highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \ - &sum); \ - return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - unsigned int aom_highbd_10_masked_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ - sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - unsigned int aom_highbd_12_masked_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \ - sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + if (!invert_mask) + comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j << 3], pred[j]); + else + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j << 3]); + } + comp_pred += width; + pred += width; + ref += stride; + mask += mask_stride; } +} -#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ - unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_masked_variance##W##x##H##_c( \ - CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ - } \ - \ - unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_10_masked_variance##W##x##H##_c( \ - CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ - } \ - \ - unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - return aom_highbd_12_masked_variance##W##x##H##_c( \ - CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \ +#define HIGHBD_MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ + } \ + \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ + invert_mask); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref, ref_stride, sse); \ } -HIGHBD_MASK_VAR(4, 4) HIGHBD_MASK_SUBPIX_VAR(4, 4) - -HIGHBD_MASK_VAR(4, 8) HIGHBD_MASK_SUBPIX_VAR(4, 8) - -HIGHBD_MASK_VAR(8, 4) HIGHBD_MASK_SUBPIX_VAR(8, 4) - -HIGHBD_MASK_VAR(8, 8) HIGHBD_MASK_SUBPIX_VAR(8, 8) - -HIGHBD_MASK_VAR(8, 16) HIGHBD_MASK_SUBPIX_VAR(8, 16) - -HIGHBD_MASK_VAR(16, 8) HIGHBD_MASK_SUBPIX_VAR(16, 8) - -HIGHBD_MASK_VAR(16, 16) HIGHBD_MASK_SUBPIX_VAR(16, 16) - -HIGHBD_MASK_VAR(16, 32) HIGHBD_MASK_SUBPIX_VAR(16, 32) - -HIGHBD_MASK_VAR(32, 16) HIGHBD_MASK_SUBPIX_VAR(32, 16) - -HIGHBD_MASK_VAR(32, 32) HIGHBD_MASK_SUBPIX_VAR(32, 32) - -HIGHBD_MASK_VAR(32, 64) HIGHBD_MASK_SUBPIX_VAR(32, 64) - -HIGHBD_MASK_VAR(64, 32) HIGHBD_MASK_SUBPIX_VAR(64, 32) - -HIGHBD_MASK_VAR(64, 64) HIGHBD_MASK_SUBPIX_VAR(64, 64) - #if CONFIG_EXT_PARTITION -HIGHBD_MASK_VAR(64, 128) HIGHBD_MASK_SUBPIX_VAR(64, 128) - -HIGHBD_MASK_VAR(128, 64) HIGHBD_MASK_SUBPIX_VAR(128, 64) - -HIGHBD_MASK_VAR(128, 128) HIGHBD_MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index 7c925cfac..20f0895cb 100644 --- a/third_party/aom/aom_dsp/variance.h +++ b/third_party/aom/aom_dsp/variance.h @@ -57,15 +57,13 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)( #if CONFIG_AV1 && CONFIG_EXT_INTER typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, - const uint8_t *msk_ptr, - int msk_stride); -typedef unsigned int (*aom_masked_variance_fn_t)( - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, - const uint8_t *msk, int msk_stride, unsigned int *sse); + const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, + int invert_mask); typedef unsigned int (*aom_masked_subpixvariance_fn_t)( const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse); + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, + const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); #endif // CONFIG_AV1 && CONFIG_EXT_INTER #if CONFIG_AV1 && CONFIG_MOTION_VAR @@ -94,7 +92,6 @@ typedef struct aom_variance_vtable { aom_sad_multi_d_fn_t sdx4df; #if CONFIG_EXT_INTER aom_masked_sad_fn_t msdf; - aom_masked_variance_fn_t mvf; aom_masked_subpixvariance_fn_t msvf; #endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c new file mode 100644 index 000000000..14352895d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + const int bd = 8; + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; + int intermediate_height = h + SUBPEL_TAPS - 1; + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. + const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + + // Filter even-index pixels + const __m128i src_0 = _mm_unpacklo_epi8(data, zero); + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + FILTER_BITS - EXTRAPREC_BITS); + + // Filter odd-index pixels + const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + FILTER_BITS - EXTRAPREC_BITS); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16(_mm_max_epi16(res, zero), + _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1)); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - + (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); + + const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storel_epi64(p, res_8bit); + } + } + } +} diff --git a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c new file mode 100644 index 000000000..74ce80e50 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/aom_convolve.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" + +#if EXTRAPREC_BITS > 2 +#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2" +#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)" +#endif + +void aom_highbd_convolve8_add_src_hip_ssse3( + const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { + assert(x_step_q4 == 16 && y_step_q4 == 16); + assert(!(w & 7)); + (void)x_step_q4; + (void)y_step_q4; + + const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); + uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); + + uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; + int intermediate_height = h + SUBPEL_TAPS - 1; + int i, j; + const int center_tap = ((SUBPEL_TAPS - 1) / 2); + const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; + + const __m128i zero = _mm_setzero_si128(); + // Add an offset to account for the "add_src" part of the convolve function. + const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); + + /* Horizontal filter */ + { + const __m128i coeffs_x = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + + (1 << (bd + FILTER_BITS - 1))); + + for (i = 0; i < intermediate_height; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i data = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i data2 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); + + // Filter even-index pixels + const __m128i res_0 = _mm_madd_epi16(data, coeff_01); + const __m128i res_2 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); + const __m128i res_4 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); + const __m128i res_6 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); + + __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), + _mm_add_epi32(res_2, res_6)); + res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), + FILTER_BITS - EXTRAPREC_BITS); + + // Filter odd-index pixels + const __m128i res_1 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); + const __m128i res_3 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); + const __m128i res_5 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); + const __m128i res_7 = + _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); + + __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), + _mm_add_epi32(res_3, res_7)); + res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), + FILTER_BITS - EXTRAPREC_BITS); + + // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 + const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1); + __m128i res = _mm_packs_epi32(res_even, res_odd); + res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); + _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); + } + } + } + + /* Vertical filter */ + { + const __m128i coeffs_y = + _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs 0 1 0 1 0 1 0 1 + const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); + // coeffs 2 3 2 3 2 3 2 3 + const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); + // coeffs 4 5 4 5 4 5 4 5 + const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); + // coeffs 6 7 6 7 6 7 6 7 + const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); + + const __m128i round_const = + _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - + (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + // Filter even-index pixels + const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; + const __m128i src_0 = + _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_2 = + _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_4 = + _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_6 = + _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); + const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); + const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); + const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); + + const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), + _mm_add_epi32(res_4, res_6)); + + // Filter odd-index pixels + const __m128i src_1 = + _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), + *(__m128i *)(data + 1 * MAX_SB_SIZE)); + const __m128i src_3 = + _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), + *(__m128i *)(data + 3 * MAX_SB_SIZE)); + const __m128i src_5 = + _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), + *(__m128i *)(data + 5 * MAX_SB_SIZE)); + const __m128i src_7 = + _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), + *(__m128i *)(data + 7 * MAX_SB_SIZE)); + + const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); + const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); + const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); + const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); + + const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), + _mm_add_epi32(res_5, res_7)); + + // Rearrange pixels back into the order 0 ... 7 + const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); + const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); + + const __m128i res_lo_round = _mm_srai_epi32( + _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); + const __m128i res_hi_round = _mm_srai_epi32( + _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); + + const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); + __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); + res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); + + __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; + _mm_storeu_si128(p, res_16bit); + } + } + } +} diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c index bcdc20f63..1a6457402 100644 --- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c @@ -94,52 +94,6 @@ void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, *min = _mm_extract_epi16(minabsdiff, 0); } -unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { - __m128i s0, s1, u0; - unsigned int avg = 0; - u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - - s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); - avg = _mm_extract_epi16(s0, 0); - return (avg + 32) >> 6; -} - -unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { - __m128i s0, s1, u0; - unsigned int avg = 0; - - u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0); - s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 2 * p), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 3 * p), u0); - s0 = _mm_adds_epu16(s0, s1); - - s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); - avg = _mm_extract_epi16(s0, 0); - return (avg + 8) >> 4; -} - static void hadamard_col8_sse2(__m128i *in, int iter) { __m128i a0 = in[0]; __m128i a1 = in[1]; diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c index 7d96e26ae..133640eb7 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -14,30 +14,6 @@ #include "./aom_dsp_rtcd.h" #include "aom_dsp/x86/convolve.h" -#define CONV8_ROUNDING_BITS (7) - -static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, - 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, - 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; - -static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, - 8, 9, 10, 11, 10, 11, 12, 13, - 4, 5, 6, 7, 6, 7, 8, 9, - 8, 9, 10, 11, 10, 11, 12, 13 }; - -static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, - 10, 11, 12, 13, 12, 13, 14, 15, - 6, 7, 8, 9, 8, 9, 10, 11, - 10, 11, 12, 13, 12, 13, 14, 15 }; - -static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; - -typedef enum { PACK_8x1, PACK_8x2, PACK_16x1 } PixelPackFormat; - -typedef void (*WritePixels)(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch); - // ----------------------------------------------------------------------------- // Copy and average @@ -216,6 +192,27 @@ void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride, } } +// ----------------------------------------------------------------------------- +// Horizontal and vertical filtering + +#define CONV8_ROUNDING_BITS (7) + +static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + +static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + +static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15, + 6, 7, 8, 9, 8, 9, 10, 11, + 10, 11, 12, 13, 12, 13, 14, 15 }; + +static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; + // ----------------------------------------------------------------------------- // Horizontal Filtering @@ -248,52 +245,30 @@ static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1, x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); } -static INLINE void pack_pixels_with_format(const uint16_t *src, - PixelPackFormat fmt, - ptrdiff_t stride, __m256i *x) { - switch (fmt) { - case PACK_8x1: { - __m256i pp[8]; - __m256i s0; - s0 = _mm256_loadu_si256((const __m256i *)src); - pack_pixels(&s0, pp); - x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); - x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); - x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); - x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); - break; - } - case PACK_8x2: { - __m256i s0, s1; - s0 = _mm256_loadu_si256((const __m256i *)src); - s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); - pack_16_pixels(&s0, &s1, x); - break; - } - case PACK_16x1: { - __m256i s0, s1; - s0 = _mm256_loadu_si256((const __m256i *)src); - s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); - pack_16_pixels(&s0, &s1, x); - break; - } - default: { assert(0); } - } -} - -static INLINE void pack_8x1_pixels(const uint16_t *src, const ptrdiff_t pitch, - __m256i *x /*x[4]*/) { - pack_pixels_with_format(src, PACK_8x1, pitch, x); +static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { + __m256i pp[8]; + __m256i s0; + s0 = _mm256_loadu_si256((const __m256i *)src); + pack_pixels(&s0, pp); + x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); + x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); + x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); + x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); } -static INLINE void pack_8x2_pixels(const uint16_t *src, const ptrdiff_t pitch, - __m256i *x /*x[8]*/) { - pack_pixels_with_format(src, PACK_8x2, pitch, x); +static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, + __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); + pack_16_pixels(&s0, &s1, x); } -static INLINE void pack_16x1_pixels(const uint16_t *src, const ptrdiff_t pitch, - __m256i *x /*x[8]*/) { - pack_pixels_with_format(src, PACK_16x1, pitch, x); +static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { + __m256i s0, s1; + s0 = _mm256_loadu_si256((const __m256i *)src); + s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); + pack_16_pixels(&s0, &s1, x); } // Note: @@ -323,51 +298,49 @@ static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, a0 = _mm256_madd_epi16(fil[1], sig[1]); a1 = _mm256_madd_epi16(fil[2], sig[2]); - const __m256i min = _mm256_min_epi32(a0, a1); - a = _mm256_add_epi32(a, min); - - const __m256i max = _mm256_max_epi32(a0, a1); - a = _mm256_add_epi32(a, max); - - const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); - a = _mm256_add_epi32(a, rounding); - *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + { + const __m256i min = _mm256_min_epi32(a0, a1); + a = _mm256_add_epi32(a, min); + } + { + const __m256i max = _mm256_max_epi32(a0, a1); + a = _mm256_add_epi32(a, max); + } + { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + a = _mm256_add_epi32(a, rounding); + *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); + } } -static void write_8x1_pixels(const __m256i *y, const __m256i *z, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { +static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, + uint16_t *dst) { const __m128i a0 = _mm256_castsi256_si128(*y); const __m128i a1 = _mm256_extractf128_si256(*y, 1); __m128i res = _mm_packus_epi32(a0, a1); - (void)z; - (void)pitch; res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); _mm_storeu_si128((__m128i *)dst, res); } -static void write_8x2_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { +static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { __m256i a = _mm256_packus_epi32(*y0, *y1); a = _mm256_min_epi16(a, *mask); _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); } -static void write_16x1_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t dst_pitch) { - (void)dst_pitch; +static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { __m256i a = _mm256_packus_epi32(*y0, *y1); a = _mm256_min_epi16(a, *mask); _mm256_storeu_si256((__m256i *)dst, a); } -static void filter_block_width8_horiz( - const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1, - const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t dst_pitch, - uint32_t height, const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d8_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[8], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); @@ -379,32 +352,22 @@ static void filter_block_width8_horiz( pack_8x2_pixels(src_ptr, src_pitch, signal); filter_8x1_pixels(signal, ff, &res0); filter_8x1_pixels(&signal[4], ff, &res1); - write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); height -= 2; src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; } while (height > 1); if (height > 0) { - pack_8x1_pixels(src_ptr, src_pitch, signal); + pack_8x1_pixels(src_ptr, signal); filter_8x1_pixels(signal, ff, &res0); - write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + store_8x1_pixels(&res0, &max, dst_ptr); } } -static void aom_highbd_filter_block1d8_h8_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, +static void aom_highbd_filter_block1d16_h8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_horiz(src, src_pitch, write_8x1_pixels, write_8x2_pixels, - dst, dst_pitch, height, filter, bd); -} - -static void filter_block_width16_horiz(const uint16_t *src_ptr, - ptrdiff_t src_pitch, - const WritePixels write_16x1, - uint16_t *dst_ptr, ptrdiff_t dst_pitch, - uint32_t height, const int16_t *filter, - int bd) { __m256i signal[8], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); @@ -413,23 +376,17 @@ static void filter_block_width16_horiz(const uint16_t *src_ptr, src_ptr -= 3; do { - pack_16x1_pixels(src_ptr, src_pitch, signal); + pack_16x1_pixels(src_ptr, signal); filter_8x1_pixels(signal, ff, &res0); filter_8x1_pixels(&signal[4], ff, &res1); - write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); height -= 1; src_ptr += src_pitch; dst_ptr += dst_pitch; } while (height > 0); } -static void aom_highbd_filter_block1d16_h8_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_horiz(src, src_pitch, write_16x1_pixels, dst, dst_pitch, - height, filter, bd); -} - +// ----------------------------------------------------------------------------- // 2-tap horizontal filtering static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { @@ -493,16 +450,6 @@ static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); } -static INLINE void filter_8x2_2t_pixels(const __m256i *sig, const __m256i *f, - __m256i *y0, __m256i *y1) { - filter_16_2t_pixels(sig, f, y0, y1); -} - -static INLINE void filter_16x1_2t_pixels(const __m256i *sig, const __m256i *f, - __m256i *y0, __m256i *y1) { - filter_16_2t_pixels(sig, f, y0, y1); -} - static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, __m256i *y0) { const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); @@ -511,10 +458,9 @@ static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); } -static void filter_block_width8_2t_horiz( - const uint16_t *src_ptr, ptrdiff_t src_pitch, const WritePixels write_8x1, - const WritePixels write_8x2, uint16_t *dst_ptr, ptrdiff_t dst_pitch, - uint32_t height, const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d8_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[2], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); @@ -524,8 +470,8 @@ static void filter_block_width8_2t_horiz( src_ptr -= 3; do { pack_8x2_2t_pixels(src_ptr, src_pitch, signal); - filter_8x2_2t_pixels(signal, &ff, &res0, &res1); - write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); height -= 2; src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; @@ -534,24 +480,13 @@ static void filter_block_width8_2t_horiz( if (height > 0) { pack_8x1_2t_pixels(src_ptr, signal); filter_8x1_2t_pixels(signal, &ff, &res0); - write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); + store_8x1_pixels(&res0, &max, dst_ptr); } } -static void aom_highbd_filter_block1d8_h2_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, +static void aom_highbd_filter_block1d16_h2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_2t_horiz(src, src_pitch, write_8x1_pixels, - write_8x2_pixels, dst, dst_pitch, height, filter, - bd); -} - -static void filter_block_width16_2t_horiz(const uint16_t *src_ptr, - ptrdiff_t src_pitch, - const WritePixels write_16x1, - uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { __m256i signal[2], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); @@ -561,21 +496,15 @@ static void filter_block_width16_2t_horiz(const uint16_t *src_ptr, src_ptr -= 3; do { pack_16x1_2t_pixels(src_ptr, signal); - filter_16x1_2t_pixels(signal, &ff, &res0, &res1); - write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); height -= 1; src_ptr += src_pitch; dst_ptr += dst_pitch; } while (height > 0); } -static void aom_highbd_filter_block1d16_h2_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_2t_horiz(src, src_pitch, write_16x1_pixels, dst, - dst_pitch, height, filter, bd); -} - +// ----------------------------------------------------------------------------- // Vertical Filtering static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { @@ -638,22 +567,9 @@ static INLINE void update_pixels(__m256i *sig) { } } -static INLINE void write_8x1_pixels_ver(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - (void)pitch; - const __m128i v0 = _mm256_castsi256_si128(*y0); - const __m128i v1 = _mm256_castsi256_si128(*y1); - __m128i p = _mm_packus_epi32(v0, v1); - p = _mm_min_epi16(p, _mm256_castsi256_si128(*mask)); - _mm_storeu_si128((__m128i *)dst, p); -} - -static void filter_block_width8_vert(const uint16_t *src_ptr, - ptrdiff_t src_pitch, WritePixels write_8x1, - WritePixels write_8x2, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d8_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[9], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); @@ -666,27 +582,13 @@ static void filter_block_width8_vert(const uint16_t *src_ptr, pack_8x9_pixels(src_ptr, src_pitch, signal); filter_8x9_pixels(signal, ff, &res0, &res1); - write_8x2(&res0, &res1, &max, dst_ptr, dst_pitch); + store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); update_pixels(signal); src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; height -= 2; - } while (height > 1); - - if (height > 0) { - pack_8x9_pixels(src_ptr, src_pitch, signal); - filter_8x9_pixels(signal, ff, &res0, &res1); - write_8x1(&res0, &res1, &max, dst_ptr, dst_pitch); - } -} - -static void aom_highbd_filter_block1d8_v8_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_vert(src, src_pitch, write_8x1_pixels_ver, - write_8x2_pixels, dst, dst_pitch, height, filter, - bd); + } while (height > 0); } static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { @@ -770,13 +672,15 @@ static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, filter_8x1_pixels(&sig[i << 2], f, &res[i]); } - const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); - const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); - *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); - *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + { + const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); + const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); + *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); + *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); + } } -static INLINE void write_16x2_pixels(const __m256i *y0, const __m256i *y1, +static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst, ptrdiff_t pitch) { __m256i p = _mm256_min_epi16(*y0, *mask); @@ -785,26 +689,14 @@ static INLINE void write_16x2_pixels(const __m256i *y0, const __m256i *y1, _mm256_storeu_si256((__m256i *)(dst + pitch), p); } -static INLINE void write_16x1_pixels_ver(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - (void)y1; - (void)pitch; - const __m256i p = _mm256_min_epi16(*y0, *mask); - _mm256_storeu_si256((__m256i *)dst, p); -} - static void update_16x9_pixels(__m256i *sig) { update_pixels(&sig[0]); update_pixels(&sig[8]); } -static void filter_block_width16_vert(const uint16_t *src_ptr, - ptrdiff_t src_pitch, - WritePixels write_16x1, - WritePixels write_16x2, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, - const int16_t *filter, int bd) { +static void aom_highbd_filter_block1d16_v8_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[17], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); @@ -816,29 +708,16 @@ static void filter_block_width16_vert(const uint16_t *src_ptr, do { pack_16x9_pixels(src_ptr, src_pitch, signal); filter_16x9_pixels(signal, ff, &res0, &res1); - write_16x2(&res0, &res1, &max, dst_ptr, dst_pitch); + store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); update_16x9_pixels(signal); src_ptr += src_pitch << 1; dst_ptr += dst_pitch << 1; height -= 2; - } while (height > 1); - - if (height > 0) { - pack_16x9_pixels(src_ptr, src_pitch, signal); - filter_16x9_pixels(signal, ff, &res0, &res1); - write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); - } -} - -static void aom_highbd_filter_block1d16_v8_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_vert(src, src_pitch, write_16x1_pixels_ver, - write_16x2_pixels, dst, dst_pitch, height, filter, - bd); + } while (height > 0); } +// ----------------------------------------------------------------------------- // 2-tap vertical filtering static void pack_16x2_init(const uint16_t *src, __m256i *sig) { @@ -859,12 +738,9 @@ static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, filter_16_2t_pixels(sig, f, y0, y1); } -static void filter_block_width16_2t_vert(const uint16_t *src_ptr, - ptrdiff_t src_pitch, - WritePixels write_16x1, - uint16_t *dst_ptr, ptrdiff_t dst_pitch, - uint32_t height, const int16_t *filter, - int bd) { +static void aom_highbd_filter_block1d16_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m256i signal[3], res0, res1; const __m256i max = _mm256_set1_epi16((1 << bd) - 1); __m256i ff; @@ -875,7 +751,7 @@ static void filter_block_width16_2t_vert(const uint16_t *src_ptr, do { pack_16x2_2t_pixels(src_ptr, src_pitch, signal); filter_16x2_2t_pixels(signal, &ff, &res0, &res1); - write_16x1(&res0, &res1, &max, dst_ptr, dst_pitch); + store_16x1_pixels(&res0, &res1, &max, dst_ptr); src_ptr += src_pitch; dst_ptr += dst_pitch; @@ -883,13 +759,6 @@ static void filter_block_width16_2t_vert(const uint16_t *src_ptr, } while (height > 0); } -static void aom_highbd_filter_block1d16_v2_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_2t_vert(src, src_pitch, write_16x1_pixels, dst, - dst_pitch, height, filter, bd); -} - static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { const __m128i h = _mm_loadu_si128((const __m128i *)filter); const __m128i p = _mm_set1_epi32(0x09080706); @@ -920,22 +789,16 @@ static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); } -static void write_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, - const __m128i *mask, uint16_t *dst) { +static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, + const __m128i *mask, uint16_t *dst) { __m128i res = _mm_packus_epi32(*y0, *y1); res = _mm_min_epi16(res, *mask); _mm_storeu_si128((__m128i *)dst, res); } -typedef void (*Write8Pixels)(const __m128i *y0, const __m128i *y1, - const __m128i *mask, uint16_t *dst); - -static void filter_block_width8_2t_vert(const uint16_t *src_ptr, - ptrdiff_t src_pitch, - Write8Pixels write_8x1, - uint16_t *dst_ptr, ptrdiff_t dst_pitch, - uint32_t height, const int16_t *filter, - int bd) { +static void aom_highbd_filter_block1d8_v2_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { __m128i signal[3], res0, res1; const __m128i max = _mm_set1_epi16((1 << bd) - 1); __m128i ff; @@ -946,7 +809,7 @@ static void filter_block_width8_2t_vert(const uint16_t *src_ptr, do { pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); filter_8_2t_pixels(signal, &ff, &res0, &res1); - write_8x1(&res0, &res1, &max, dst_ptr); + store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); src_ptr += src_pitch; dst_ptr += dst_pitch; @@ -954,20 +817,10 @@ static void filter_block_width8_2t_vert(const uint16_t *src_ptr, } while (height > 0); } -static void aom_highbd_filter_block1d8_v2_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_2t_vert(src, src_pitch, write_8x1_2t_pixels_ver, dst, - dst_pitch, height, filter, bd); -} - // Calculation with averaging the input pixels -static void write_8x1_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - (void)y1; - (void)pitch; +static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, + uint16_t *dst) { const __m128i a0 = _mm256_castsi256_si128(*y0); const __m128i a1 = _mm256_extractf128_si256(*y0, 1); __m128i res = _mm_packus_epi32(a0, a1); @@ -977,9 +830,9 @@ static void write_8x1_avg_pixels(const __m256i *y0, const __m256i *y1, _mm_storeu_si128((__m128i *)dst, res); } -static void write_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { +static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst, + ptrdiff_t pitch) { __m256i a = _mm256_packus_epi32(*y0, *y1); const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); @@ -991,10 +844,8 @@ static void write_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); } -static void write_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - (void)pitch; +static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, + const __m256i *mask, uint16_t *dst) { __m256i a = _mm256_packus_epi32(*y0, *y1); const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); a = _mm256_min_epi16(a, *mask); @@ -1002,21 +853,7 @@ static void write_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, _mm256_storeu_si256((__m256i *)dst, a); } -static INLINE void write_8x1_avg_pixels_ver(const __m256i *y0, - const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - (void)pitch; - const __m128i v0 = _mm256_castsi256_si128(*y0); - const __m128i v1 = _mm256_castsi256_si128(*y1); - __m128i p = _mm_packus_epi32(v0, v1); - const __m128i pix = _mm_loadu_si128((const __m128i *)dst); - p = _mm_min_epi16(p, _mm256_castsi256_si128(*mask)); - p = _mm_avg_epu16(p, pix); - _mm_storeu_si128((__m128i *)dst, p); -} - -static INLINE void write_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, +static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, const __m256i *mask, uint16_t *dst, ptrdiff_t pitch) { const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); @@ -1030,20 +867,10 @@ static INLINE void write_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, _mm256_storeu_si256((__m256i *)(dst + pitch), p); } -static INLINE void write_16x1_avg_pixels_ver(const __m256i *y0, - const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - (void)y1; - (void)pitch; - __m256i p = _mm256_min_epi16(*y0, *mask); - const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); - p = _mm256_avg_epu16(p, pix); - _mm256_storeu_si256((__m256i *)dst, p); -} - -static void write_8x1_2t_avg_pixels_ver(const __m128i *y0, const __m128i *y1, - const __m128i *mask, uint16_t *dst) { +static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, + const __m128i *y1, + const __m128i *mask, + uint16_t *dst) { __m128i res = _mm_packus_epi32(*y0, *y1); const __m128i pix = _mm_loadu_si128((const __m128i *)dst); res = _mm_min_epi16(res, *mask); @@ -1052,96 +879,229 @@ static void write_8x1_2t_avg_pixels_ver(const __m128i *y0, const __m128i *y1, } static void aom_highbd_filter_block1d8_h8_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_horiz(src, src_pitch, write_8x1_avg_pixels, - write_8x2_avg_pixels, dst, dst_pitch, height, - filter, bd); + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_8x2_pixels(src_ptr, src_pitch, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } } static void aom_highbd_filter_block1d16_h8_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_horiz(src, src_pitch, write_16x1_avg_pixels, dst, - dst_pitch, height, filter, bd); + __m256i signal[8], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + src_ptr -= 3; + do { + pack_16x1_pixels(src_ptr, signal); + filter_8x1_pixels(signal, ff, &res0); + filter_8x1_pixels(&signal[4], ff, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); } static void aom_highbd_filter_block1d8_v8_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_vert(src, src_pitch, write_8x1_avg_pixels_ver, - write_8x2_avg_pixels, dst, dst_pitch, height, filter, - bd); + __m256i signal[9], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_8x9_init(src_ptr, src_pitch, signal); + + do { + pack_8x9_pixels(src_ptr, src_pitch, signal); + + filter_8x9_pixels(signal, ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_pixels(signal); + + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); } static void aom_highbd_filter_block1d16_v8_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_vert(src, src_pitch, write_16x1_avg_pixels_ver, - write_16x2_avg_pixels, dst, dst_pitch, height, - filter, bd); -} + __m256i signal[17], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff[4]; + pack_filters(filter, ff); + + pack_16x9_init(src_ptr, src_pitch, signal); + + do { + pack_16x9_pixels(src_ptr, src_pitch, signal); + filter_16x9_pixels(signal, ff, &res0, &res1); + store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + update_16x9_pixels(signal); -// 2-tap averaging + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + height -= 2; + } while (height > 0); +} static void aom_highbd_filter_block1d8_h2_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_2t_horiz(src, src_pitch, write_8x1_avg_pixels, - write_8x2_avg_pixels, dst, dst_pitch, height, - filter, bd); + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_8x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); + height -= 2; + src_ptr += src_pitch << 1; + dst_ptr += dst_pitch << 1; + } while (height > 1); + + if (height > 0) { + pack_8x1_2t_pixels(src_ptr, signal); + filter_8x1_2t_pixels(signal, &ff, &res0); + store_8x1_avg_pixels(&res0, &max, dst_ptr); + } } static void aom_highbd_filter_block1d16_h2_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_2t_horiz(src, src_pitch, write_16x1_avg_pixels, dst, - dst_pitch, height, filter, bd); + __m256i signal[2], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + + __m256i ff; + pack_2t_filter(filter, &ff); + + src_ptr -= 3; + do { + pack_16x1_2t_pixels(src_ptr, signal); + filter_16_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + height -= 1; + src_ptr += src_pitch; + dst_ptr += dst_pitch; + } while (height > 0); } static void aom_highbd_filter_block1d16_v2_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width16_2t_vert(src, src_pitch, write_16x1_avg_pixels, dst, - dst_pitch, height, filter, bd); + __m256i signal[3], res0, res1; + const __m256i max = _mm256_set1_epi16((1 << bd) - 1); + __m256i ff; + + pack_2t_filter(filter, &ff); + pack_16x2_init(src_ptr, signal); + + do { + pack_16x2_2t_pixels(src_ptr, src_pitch, signal); + filter_16x2_2t_pixels(signal, &ff, &res0, &res1); + store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); + + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); } static void aom_highbd_filter_block1d8_v2_avg_avx2( - const uint16_t *src, ptrdiff_t src_pitch, uint16_t *dst, + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - filter_block_width8_2t_vert(src, src_pitch, write_8x1_2t_avg_pixels_ver, dst, - dst_pitch, height, filter, bd); -} + __m128i signal[3], res0, res1; + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + __m128i ff; -typedef void HbdFilter1dFunc(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, - uint32_t, const int16_t *, int); + pack_8x1_2t_filter(filter, &ff); + pack_8x2_init(src_ptr, signal); -#define HIGHBD_FUNC(width, dir, avg, opt) \ - aom_highbd_filter_block1d##width##_##dir##_##avg##opt + do { + pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); + filter_8_2t_pixels(signal, &ff, &res0, &res1); + store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); -HbdFilter1dFunc HIGHBD_FUNC(4, h8, , sse2); -HbdFilter1dFunc HIGHBD_FUNC(4, h2, , sse2); -HbdFilter1dFunc HIGHBD_FUNC(4, v8, , sse2); -HbdFilter1dFunc HIGHBD_FUNC(4, v2, , sse2); + src_ptr += src_pitch; + dst_ptr += dst_pitch; + height -= 1; + } while (height > 0); +} -#define aom_highbd_filter_block1d4_h8_avx2 HIGHBD_FUNC(4, h8, , sse2) -#define aom_highbd_filter_block1d4_h2_avx2 HIGHBD_FUNC(4, h2, , sse2) -#define aom_highbd_filter_block1d4_v8_avx2 HIGHBD_FUNC(4, v8, , sse2) -#define aom_highbd_filter_block1d4_v2_avx2 HIGHBD_FUNC(4, v2, , sse2) +void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, + ptrdiff_t, uint32_t, const int16_t *, + int); +#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2 +#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2 +#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2 +#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2 HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); HIGH_FUN_CONV_2D(, avx2); -HbdFilter1dFunc HIGHBD_FUNC(4, h8, avg_, sse2); -HbdFilter1dFunc HIGHBD_FUNC(4, h2, avg_, sse2); -HbdFilter1dFunc HIGHBD_FUNC(4, v8, avg_, sse2); -HbdFilter1dFunc HIGHBD_FUNC(4, v2, avg_, sse2); - -#define aom_highbd_filter_block1d4_h8_avg_avx2 HIGHBD_FUNC(4, h8, avg_, sse2) -#define aom_highbd_filter_block1d4_h2_avg_avx2 HIGHBD_FUNC(4, h2, avg_, sse2) -#define aom_highbd_filter_block1d4_v8_avg_avx2 HIGHBD_FUNC(4, v8, avg_, sse2) -#define aom_highbd_filter_block1d4_v2_avg_avx2 HIGHBD_FUNC(4, v2, avg_, sse2) +void aom_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void aom_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void aom_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +void aom_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, + uint16_t *, ptrdiff_t, uint32_t, + const int16_t *, int); +#define aom_highbd_filter_block1d4_h8_avg_avx2 \ + aom_highbd_filter_block1d4_h8_avg_sse2 +#define aom_highbd_filter_block1d4_h2_avg_avx2 \ + aom_highbd_filter_block1d4_h2_avg_sse2 +#define aom_highbd_filter_block1d4_v8_avg_avx2 \ + aom_highbd_filter_block1d4_v8_avg_sse2 +#define aom_highbd_filter_block1d4_v2_avg_avx2 \ + aom_highbd_filter_block1d4_v2_avg_sse2 HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c b/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 000000000..a9d6a127c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,1238 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/inv_txfm.h" +#include "aom_dsp/x86/inv_txfm_common_avx2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +void aom_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[16]; + load_buffer_16x16(input, in); + mm256_transpose_16x16(in, in); + av1_idct16_avx2(in); + mm256_transpose_16x16(in, in); + av1_idct16_avx2(in); + store_buffer_16xN(in, stride, dest, 16); +} + +static INLINE void transpose_col_to_row_nz4x4(__m256i *in /*in[4]*/) { + const __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i u1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i v0 = _mm256_unpacklo_epi32(u0, u1); + const __m256i v1 = _mm256_unpackhi_epi32(u0, u1); + in[0] = _mm256_permute4x64_epi64(v0, 0xA8); + in[1] = _mm256_permute4x64_epi64(v0, 0xA9); + in[2] = _mm256_permute4x64_epi64(v1, 0xA8); + in[3] = _mm256_permute4x64_epi64(v1, 0xA9); +} + +#define MM256_SHUFFLE_EPI64(x0, x1, imm8) \ + _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(x0), \ + _mm256_castsi256_pd(x1), imm8)) + +static INLINE void transpose_col_to_row_nz4x16(__m256i *in /*in[16]*/) { + int i; + for (i = 0; i < 16; i += 4) { + transpose_col_to_row_nz4x4(&in[i]); + } + + for (i = 0; i < 4; ++i) { + in[i] = MM256_SHUFFLE_EPI64(in[i], in[i + 4], 0); + in[i + 8] = MM256_SHUFFLE_EPI64(in[i + 8], in[i + 12], 0); + } + + for (i = 0; i < 4; ++i) { + in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20); + } +} + +// Coefficients 0-7 before the final butterfly +static INLINE void idct16_10_first_half(const __m256i *in, __m256i *out) { + const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m256i v4 = _mm256_mulhrs_epi16(in[2], c2p28); + const __m256i v7 = _mm256_mulhrs_epi16(in[2], c2p04); + + const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m256i v0 = _mm256_mulhrs_epi16(in[0], c2p16); + const __m256i v1 = v0; + + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + __m256i v5, v6; + unpack_butter_fly(&v7, &v4, &cospi_p16_m16, &cospi_p16_p16, &v5, &v6); + + out[0] = _mm256_add_epi16(v0, v7); + out[1] = _mm256_add_epi16(v1, v6); + out[2] = _mm256_add_epi16(v1, v5); + out[3] = _mm256_add_epi16(v0, v4); + out[4] = _mm256_sub_epi16(v0, v4); + out[5] = _mm256_sub_epi16(v1, v5); + out[6] = _mm256_sub_epi16(v1, v6); + out[7] = _mm256_sub_epi16(v0, v7); +} + +// Coefficients 8-15 before the final butterfly +static INLINE void idct16_10_second_half(const __m256i *in, __m256i *out) { + const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30); + const __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02); + + const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + const __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26); + const __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06); + + const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + + __m256i t1, t2, t5, t6; + unpack_butter_fly(&t0, &t7, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); + unpack_butter_fly(&t3, &t4, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); + + out[0] = _mm256_add_epi16(t0, t3); + out[1] = _mm256_add_epi16(t1, t2); + out[6] = _mm256_add_epi16(t6, t5); + out[7] = _mm256_add_epi16(t7, t4); + + const __m256i v2 = _mm256_sub_epi16(t1, t2); + const __m256i v3 = _mm256_sub_epi16(t0, t3); + const __m256i v4 = _mm256_sub_epi16(t7, t4); + const __m256i v5 = _mm256_sub_epi16(t6, t5); + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]); + unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]); +} + +static INLINE void add_sub_butterfly(const __m256i *in, __m256i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +static INLINE void idct16_10(__m256i *in /*in[16]*/) { + __m256i out[16]; + idct16_10_first_half(in, out); + idct16_10_second_half(in, &out[8]); + add_sub_butterfly(out, in, 16); +} + +void aom_idct16x16_10_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[16]; + + load_coeff(input, &in[0]); + load_coeff(input + 16, &in[1]); + load_coeff(input + 32, &in[2]); + load_coeff(input + 48, &in[3]); + + transpose_col_to_row_nz4x4(in); + idct16_10(in); + + transpose_col_to_row_nz4x16(in); + idct16_10(in); + + store_buffer_16xN(in, stride, dest, 16); +} + +// Note: +// For 16x16 int16_t matrix +// transpose first 8 columns into first 8 rows. +// Since only upper-left 8x8 are non-zero, the input are first 8 rows (in[8]). +// After transposing, the 8 row vectors are in in[8]. +void transpose_col_to_row_nz8x8(__m256i *in /*in[8]*/) { + __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]); + __m256i u1 = _mm256_unpackhi_epi16(in[0], in[1]); + __m256i u2 = _mm256_unpacklo_epi16(in[2], in[3]); + __m256i u3 = _mm256_unpackhi_epi16(in[2], in[3]); + + const __m256i v0 = _mm256_unpacklo_epi32(u0, u2); + const __m256i v1 = _mm256_unpackhi_epi32(u0, u2); + const __m256i v2 = _mm256_unpacklo_epi32(u1, u3); + const __m256i v3 = _mm256_unpackhi_epi32(u1, u3); + + u0 = _mm256_unpacklo_epi16(in[4], in[5]); + u1 = _mm256_unpackhi_epi16(in[4], in[5]); + u2 = _mm256_unpacklo_epi16(in[6], in[7]); + u3 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i v4 = _mm256_unpacklo_epi32(u0, u2); + const __m256i v5 = _mm256_unpackhi_epi32(u0, u2); + const __m256i v6 = _mm256_unpacklo_epi32(u1, u3); + const __m256i v7 = _mm256_unpackhi_epi32(u1, u3); + + in[0] = MM256_SHUFFLE_EPI64(v0, v4, 0); + in[1] = MM256_SHUFFLE_EPI64(v0, v4, 3); + in[2] = MM256_SHUFFLE_EPI64(v1, v5, 0); + in[3] = MM256_SHUFFLE_EPI64(v1, v5, 3); + in[4] = MM256_SHUFFLE_EPI64(v2, v6, 0); + in[5] = MM256_SHUFFLE_EPI64(v2, v6, 3); + in[6] = MM256_SHUFFLE_EPI64(v3, v7, 0); + in[7] = MM256_SHUFFLE_EPI64(v3, v7, 3); +} + +// Note: +// For 16x16 int16_t matrix +// transpose first 8 columns into first 8 rows. +// Since only matrix left 8x16 are non-zero, the input are total 16 rows +// (in[16]). +// After transposing, the 8 row vectors are in in[8]. All else are zero. +static INLINE void transpose_col_to_row_nz8x16(__m256i *in /*in[16]*/) { + transpose_col_to_row_nz8x8(in); + transpose_col_to_row_nz8x8(&in[8]); + + int i; + for (i = 0; i < 8; ++i) { + in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20); + } +} + +static INLINE void idct16_38_first_half(const __m256i *in, __m256i *out) { + const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + __m256i t4 = _mm256_mulhrs_epi16(in[2], c2p28); + __m256i t7 = _mm256_mulhrs_epi16(in[2], c2p04); + + const __m256i c2m20 = pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m256i c2p12 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + __m256i t5 = _mm256_mulhrs_epi16(in[6], c2m20); + __m256i t6 = _mm256_mulhrs_epi16(in[6], c2p12); + + const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m256i c2p24 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m256i c2p08 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + const __m256i u0 = _mm256_mulhrs_epi16(in[0], c2p16); + const __m256i u1 = _mm256_mulhrs_epi16(in[0], c2p16); + const __m256i u2 = _mm256_mulhrs_epi16(in[4], c2p24); + const __m256i u3 = _mm256_mulhrs_epi16(in[4], c2p08); + + const __m256i u4 = _mm256_add_epi16(t4, t5); + const __m256i u5 = _mm256_sub_epi16(t4, t5); + const __m256i u6 = _mm256_sub_epi16(t7, t6); + const __m256i u7 = _mm256_add_epi16(t7, t6); + + const __m256i t0 = _mm256_add_epi16(u0, u3); + const __m256i t1 = _mm256_add_epi16(u1, u2); + const __m256i t2 = _mm256_sub_epi16(u1, u2); + const __m256i t3 = _mm256_sub_epi16(u0, u3); + + t4 = u4; + t7 = u7; + + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6); + + out[0] = _mm256_add_epi16(t0, t7); + out[1] = _mm256_add_epi16(t1, t6); + out[2] = _mm256_add_epi16(t2, t5); + out[3] = _mm256_add_epi16(t3, t4); + out[4] = _mm256_sub_epi16(t3, t4); + out[5] = _mm256_sub_epi16(t2, t5); + out[6] = _mm256_sub_epi16(t1, t6); + out[7] = _mm256_sub_epi16(t0, t7); +} + +static INLINE void idct16_38_second_half(const __m256i *in, __m256i *out) { + const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30); + __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02); + + const __m256i c2m18 = pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); + const __m256i c2p14 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); + __m256i t1 = _mm256_mulhrs_epi16(in[7], c2m18); + __m256i t6 = _mm256_mulhrs_epi16(in[7], c2p14); + + const __m256i c2p22 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); + const __m256i c2p10 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); + __m256i t2 = _mm256_mulhrs_epi16(in[5], c2p22); + __m256i t5 = _mm256_mulhrs_epi16(in[5], c2p10); + + const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26); + __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06); + + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + v0 = _mm256_add_epi16(t0, t1); + v1 = _mm256_sub_epi16(t0, t1); + v2 = _mm256_sub_epi16(t3, t2); + v3 = _mm256_add_epi16(t2, t3); + v4 = _mm256_add_epi16(t4, t5); + v5 = _mm256_sub_epi16(t4, t5); + v6 = _mm256_sub_epi16(t7, t6); + v7 = _mm256_add_epi16(t6, t7); + + t0 = v0; + t7 = v7; + t3 = v3; + t4 = v4; + const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); + unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); + + v0 = _mm256_add_epi16(t0, t3); + v1 = _mm256_add_epi16(t1, t2); + v2 = _mm256_sub_epi16(t1, t2); + v3 = _mm256_sub_epi16(t0, t3); + v4 = _mm256_sub_epi16(t7, t4); + v5 = _mm256_sub_epi16(t6, t5); + v6 = _mm256_add_epi16(t6, t5); + v7 = _mm256_add_epi16(t7, t4); + + // stage 6, (8-15) + out[0] = v0; + out[1] = v1; + out[6] = v6; + out[7] = v7; + const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]); + unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]); +} + +static INLINE void idct16_38(__m256i *in /*in[16]*/) { + __m256i out[16]; + idct16_38_first_half(in, out); + idct16_38_second_half(in, &out[8]); + add_sub_butterfly(out, in, 16); +} + +void aom_idct16x16_38_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[16]; + + int i; + for (i = 0; i < 8; ++i) { + load_coeff(input + (i << 4), &in[i]); + } + + transpose_col_to_row_nz8x8(in); + idct16_38(in); + + transpose_col_to_row_nz8x16(in); + idct16_38(in); + + store_buffer_16xN(in, stride, dest, 16); +} + +static INLINE int calculate_dc(const tran_low_t *input) { + int dc = (int)dct_const_round_shift(input[0] * cospi_16_64); + dc = (int)dct_const_round_shift(dc * cospi_16_64); + dc = ROUND_POWER_OF_TWO(dc, IDCT_ROUNDING_POS); + return dc; +} + +void aom_idct16x16_1_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + const int dc = calculate_dc(input); + if (dc == 0) return; + + const __m256i dc_value = _mm256_set1_epi16(dc); + + int i; + for (i = 0; i < 16; ++i) { + recon_and_store(&dc_value, dest); + dest += stride; + } +} + +// ----------------------------------------------------------------------------- +// 32x32 partial IDCT + +void aom_idct32x32_1_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + const int dc = calculate_dc(input); + if (dc == 0) return; + + const __m256i dc_value = _mm256_set1_epi16(dc); + + int i; + for (i = 0; i < 32; ++i) { + recon_and_store(&dc_value, dest); + recon_and_store(&dc_value, dest + 16); + dest += stride; + } +} + +static void load_buffer_32x16(const tran_low_t *input, __m256i *in /*in[32]*/) { + int i; + for (i = 0; i < 16; ++i) { + load_coeff(input, &in[i]); + load_coeff(input + 16, &in[i + 16]); + input += 32; + } +} + +// Note: +// We extend SSSE3 operations to AVX2. Instead of operating on __m128i, we +// operate coefficients on __m256i. Our operation capacity doubles for each +// instruction. +#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ + do { \ + tmp0 = _mm256_madd_epi16(x0, co0); \ + tmp1 = _mm256_madd_epi16(x1, co0); \ + tmp2 = _mm256_madd_epi16(x0, co1); \ + tmp3 = _mm256_madd_epi16(x1, co1); \ + tmp0 = _mm256_add_epi32(tmp0, rounding); \ + tmp1 = _mm256_add_epi32(tmp1, rounding); \ + tmp2 = _mm256_add_epi32(tmp2, rounding); \ + tmp3 = _mm256_add_epi32(tmp3, rounding); \ + tmp0 = _mm256_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm256_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm256_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm256_srai_epi32(tmp3, DCT_CONST_BITS); \ + } while (0) + +static INLINE void butterfly(const __m256i *x0, const __m256i *x1, + const __m256i *c0, const __m256i *c1, __m256i *y0, + __m256i *y1) { + __m256i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm256_unpacklo_epi16(*x0, *x1); + u1 = _mm256_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *y0 = _mm256_packs_epi32(tmp0, tmp1); + *y1 = _mm256_packs_epi32(tmp2, tmp3); +} + +static INLINE void butterfly_self(__m256i *x0, __m256i *x1, const __m256i *c0, + const __m256i *c1) { + __m256i tmp0, tmp1, tmp2, tmp3, u0, u1; + const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + u0 = _mm256_unpacklo_epi16(*x0, *x1); + u1 = _mm256_unpackhi_epi16(*x0, *x1); + BUTTERFLY_PAIR(u0, u1, *c0, *c1); + *x0 = _mm256_packs_epi32(tmp0, tmp1); + *x1 = _mm256_packs_epi32(tmp2, tmp3); +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i in[32] +static void idct32_full_16x32_quarter_2(const __m256i *in /*in[32]*/, + __m256i *out /*out[16]*/) { + __m256i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ + __m256i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ + + { + const __m256i stg2_0 = pair256_set_epi16(cospi_30_64, -cospi_2_64); + const __m256i stg2_1 = pair256_set_epi16(cospi_2_64, cospi_30_64); + const __m256i stg2_2 = pair256_set_epi16(cospi_14_64, -cospi_18_64); + const __m256i stg2_3 = pair256_set_epi16(cospi_18_64, cospi_14_64); + butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); + butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); + } + + v8 = _mm256_add_epi16(u8, u9); + v9 = _mm256_sub_epi16(u8, u9); + v14 = _mm256_sub_epi16(u15, u14); + v15 = _mm256_add_epi16(u15, u14); + + { + const __m256i stg2_4 = pair256_set_epi16(cospi_22_64, -cospi_10_64); + const __m256i stg2_5 = pair256_set_epi16(cospi_10_64, cospi_22_64); + const __m256i stg2_6 = pair256_set_epi16(cospi_6_64, -cospi_26_64); + const __m256i stg2_7 = pair256_set_epi16(cospi_26_64, cospi_6_64); + butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); + butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); + } + + v10 = _mm256_sub_epi16(u11, u10); + v11 = _mm256_add_epi16(u11, u10); + v12 = _mm256_add_epi16(u12, u13); + v13 = _mm256_sub_epi16(u12, u13); + + { + const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); + } + + out[0] = _mm256_add_epi16(v8, v11); + out[1] = _mm256_add_epi16(v9, v10); + out[6] = _mm256_add_epi16(v14, v13); + out[7] = _mm256_add_epi16(v15, v12); + + out[2] = _mm256_sub_epi16(v9, v10); + out[3] = _mm256_sub_epi16(v8, v11); + out[4] = _mm256_sub_epi16(v15, v12); + out[5] = _mm256_sub_epi16(v14, v13); + + { + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); + } +} + +// For each 8x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i in[32] +static void idct32_full_16x32_quarter_1(const __m256i *in /*in[32]*/, + __m256i *out /*out[8]*/) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ + __m256i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ + + { + const __m256i stg3_0 = pair256_set_epi16(cospi_28_64, -cospi_4_64); + const __m256i stg3_1 = pair256_set_epi16(cospi_4_64, cospi_28_64); + const __m256i stg3_2 = pair256_set_epi16(cospi_12_64, -cospi_20_64); + const __m256i stg3_3 = pair256_set_epi16(cospi_20_64, cospi_12_64); + butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); + butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); + } + + v4 = _mm256_add_epi16(u4, u5); + v5 = _mm256_sub_epi16(u4, u5); + v6 = _mm256_sub_epi16(u7, u6); + v7 = _mm256_add_epi16(u7, u6); + + { + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i stg4_2 = pair256_set_epi16(cospi_24_64, -cospi_8_64); + const __m256i stg4_3 = pair256_set_epi16(cospi_8_64, cospi_24_64); + butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); + + butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); + butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); + } + + v0 = _mm256_add_epi16(u0, u3); + v1 = _mm256_add_epi16(u1, u2); + v2 = _mm256_sub_epi16(u1, u2); + v3 = _mm256_sub_epi16(u0, u3); + + out[0] = _mm256_add_epi16(v0, v7); + out[1] = _mm256_add_epi16(v1, v6); + out[2] = _mm256_add_epi16(v2, v5); + out[3] = _mm256_add_epi16(v3, v4); + out[4] = _mm256_sub_epi16(v3, v4); + out[5] = _mm256_sub_epi16(v2, v5); + out[6] = _mm256_sub_epi16(v1, v6); + out[7] = _mm256_sub_epi16(v0, v7); +} + +// For each 8x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i in[32] +// We avoid hide an offset, 16, inside this function. So we output 0-15 into +// array out[16] +static void idct32_full_16x32_quarter_3_4(const __m256i *in /*in[32]*/, + __m256i *out /*out[16]*/) { + __m256i v16, v17, v18, v19, v20, v21, v22, v23; + __m256i v24, v25, v26, v27, v28, v29, v30, v31; + __m256i u16, u17, u18, u19, u20, u21, u22, u23; + __m256i u24, u25, u26, u27, u28, u29, u30, u31; + + { + const __m256i stg1_0 = pair256_set_epi16(cospi_31_64, -cospi_1_64); + const __m256i stg1_1 = pair256_set_epi16(cospi_1_64, cospi_31_64); + const __m256i stg1_2 = pair256_set_epi16(cospi_15_64, -cospi_17_64); + const __m256i stg1_3 = pair256_set_epi16(cospi_17_64, cospi_15_64); + const __m256i stg1_4 = pair256_set_epi16(cospi_23_64, -cospi_9_64); + const __m256i stg1_5 = pair256_set_epi16(cospi_9_64, cospi_23_64); + const __m256i stg1_6 = pair256_set_epi16(cospi_7_64, -cospi_25_64); + const __m256i stg1_7 = pair256_set_epi16(cospi_25_64, cospi_7_64); + const __m256i stg1_8 = pair256_set_epi16(cospi_27_64, -cospi_5_64); + const __m256i stg1_9 = pair256_set_epi16(cospi_5_64, cospi_27_64); + const __m256i stg1_10 = pair256_set_epi16(cospi_11_64, -cospi_21_64); + const __m256i stg1_11 = pair256_set_epi16(cospi_21_64, cospi_11_64); + const __m256i stg1_12 = pair256_set_epi16(cospi_19_64, -cospi_13_64); + const __m256i stg1_13 = pair256_set_epi16(cospi_13_64, cospi_19_64); + const __m256i stg1_14 = pair256_set_epi16(cospi_3_64, -cospi_29_64); + const __m256i stg1_15 = pair256_set_epi16(cospi_29_64, cospi_3_64); + butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); + butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); + butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); + butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); + + butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); + butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); + + butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); + butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); + } + + v16 = _mm256_add_epi16(u16, u17); + v17 = _mm256_sub_epi16(u16, u17); + v18 = _mm256_sub_epi16(u19, u18); + v19 = _mm256_add_epi16(u19, u18); + + v20 = _mm256_add_epi16(u20, u21); + v21 = _mm256_sub_epi16(u20, u21); + v22 = _mm256_sub_epi16(u23, u22); + v23 = _mm256_add_epi16(u23, u22); + + v24 = _mm256_add_epi16(u24, u25); + v25 = _mm256_sub_epi16(u24, u25); + v26 = _mm256_sub_epi16(u27, u26); + v27 = _mm256_add_epi16(u27, u26); + + v28 = _mm256_add_epi16(u28, u29); + v29 = _mm256_sub_epi16(u28, u29); + v30 = _mm256_sub_epi16(u31, u30); + v31 = _mm256_add_epi16(u31, u30); + + { + const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); + const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); + const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); + } + + u16 = _mm256_add_epi16(v16, v19); + u17 = _mm256_add_epi16(v17, v18); + u18 = _mm256_sub_epi16(v17, v18); + u19 = _mm256_sub_epi16(v16, v19); + u20 = _mm256_sub_epi16(v23, v20); + u21 = _mm256_sub_epi16(v22, v21); + u22 = _mm256_add_epi16(v22, v21); + u23 = _mm256_add_epi16(v23, v20); + + u24 = _mm256_add_epi16(v24, v27); + u25 = _mm256_add_epi16(v25, v26); + u26 = _mm256_sub_epi16(v25, v26); + u27 = _mm256_sub_epi16(v24, v27); + + u28 = _mm256_sub_epi16(v31, v28); + u29 = _mm256_sub_epi16(v30, v29); + u30 = _mm256_add_epi16(v29, v30); + u31 = _mm256_add_epi16(v28, v31); + + { + const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + } + + out[0] = _mm256_add_epi16(u16, u23); + out[1] = _mm256_add_epi16(u17, u22); + out[2] = _mm256_add_epi16(u18, u21); + out[3] = _mm256_add_epi16(u19, u20); + out[4] = _mm256_sub_epi16(u19, u20); + out[5] = _mm256_sub_epi16(u18, u21); + out[6] = _mm256_sub_epi16(u17, u22); + out[7] = _mm256_sub_epi16(u16, u23); + + out[8] = _mm256_sub_epi16(u31, u24); + out[9] = _mm256_sub_epi16(u30, u25); + out[10] = _mm256_sub_epi16(u29, u26); + out[11] = _mm256_sub_epi16(u28, u27); + out[12] = _mm256_add_epi16(u27, u28); + out[13] = _mm256_add_epi16(u26, u29); + out[14] = _mm256_add_epi16(u25, u30); + out[15] = _mm256_add_epi16(u24, u31); + + { + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); + butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); + butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); + butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); + } +} + +static void idct32_full_16x32_quarter_1_2(const __m256i *in /*in[32]*/, + __m256i *out /*out[32]*/) { + __m256i temp[16]; + idct32_full_16x32_quarter_1(in, temp); + idct32_full_16x32_quarter_2(in, &temp[8]); + add_sub_butterfly(temp, out, 16); +} + +static void idct32_16x32(const __m256i *in /*in[32]*/, + __m256i *out /*out[32]*/) { + __m256i temp[32]; + idct32_full_16x32_quarter_1_2(in, temp); + idct32_full_16x32_quarter_3_4(in, &temp[16]); + add_sub_butterfly(temp, out, 32); +} + +void aom_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i col[64], in[32]; + int i; + + for (i = 0; i < 2; ++i) { + load_buffer_32x16(input, in); + input += 32 << 4; + + mm256_transpose_16x16(in, in); + mm256_transpose_16x16(&in[16], &in[16]); + idct32_16x32(in, col + (i << 5)); + } + + for (i = 0; i < 2; ++i) { + int j = i << 4; + mm256_transpose_16x16(col + j, in); + mm256_transpose_16x16(col + j + 32, &in[16]); + idct32_16x32(in, in); + store_buffer_16xN(in, stride, dest, 32); + dest += 16; + } +} + +// Group the coefficient calculation into smaller functions +// to prevent stack spillover: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 +static void idct32_16x32_135_quarter_1(const __m256i *in /*in[16]*/, + __m256i *out /*out[8]*/) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + + { + const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m256i stk4_2 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m256i stk4_3 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + u0 = _mm256_mulhrs_epi16(in[0], stk4_0); + u2 = _mm256_mulhrs_epi16(in[8], stk4_2); + u3 = _mm256_mulhrs_epi16(in[8], stk4_3); + u1 = u0; + } + + v0 = _mm256_add_epi16(u0, u3); + v1 = _mm256_add_epi16(u1, u2); + v2 = _mm256_sub_epi16(u1, u2); + v3 = _mm256_sub_epi16(u0, u3); + + { + const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m256i stk3_2 = + pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m256i stk3_3 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + u4 = _mm256_mulhrs_epi16(in[4], stk3_0); + u7 = _mm256_mulhrs_epi16(in[4], stk3_1); + u5 = _mm256_mulhrs_epi16(in[12], stk3_2); + u6 = _mm256_mulhrs_epi16(in[12], stk3_3); + } + + v4 = _mm256_add_epi16(u4, u5); + v5 = _mm256_sub_epi16(u4, u5); + v6 = _mm256_sub_epi16(u7, u6); + v7 = _mm256_add_epi16(u7, u6); + + { + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); + } + + out[0] = _mm256_add_epi16(v0, v7); + out[1] = _mm256_add_epi16(v1, v6); + out[2] = _mm256_add_epi16(v2, v5); + out[3] = _mm256_add_epi16(v3, v4); + out[4] = _mm256_sub_epi16(v3, v4); + out[5] = _mm256_sub_epi16(v2, v5); + out[6] = _mm256_sub_epi16(v1, v6); + out[7] = _mm256_sub_epi16(v0, v7); +} + +static void idct32_16x32_135_quarter_2(const __m256i *in /*in[16]*/, + __m256i *out /*out[8]*/) { + __m256i u8, u9, u10, u11, u12, u13, u14, u15; + __m256i v8, v9, v10, v11, v12, v13, v14, v15; + + { + const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m256i stk2_2 = + pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); + const __m256i stk2_3 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); + const __m256i stk2_4 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); + const __m256i stk2_5 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); + const __m256i stk2_6 = + pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + u8 = _mm256_mulhrs_epi16(in[2], stk2_0); + u15 = _mm256_mulhrs_epi16(in[2], stk2_1); + u9 = _mm256_mulhrs_epi16(in[14], stk2_2); + u14 = _mm256_mulhrs_epi16(in[14], stk2_3); + u10 = _mm256_mulhrs_epi16(in[10], stk2_4); + u13 = _mm256_mulhrs_epi16(in[10], stk2_5); + u11 = _mm256_mulhrs_epi16(in[6], stk2_6); + u12 = _mm256_mulhrs_epi16(in[6], stk2_7); + } + + v8 = _mm256_add_epi16(u8, u9); + v9 = _mm256_sub_epi16(u8, u9); + v10 = _mm256_sub_epi16(u11, u10); + v11 = _mm256_add_epi16(u11, u10); + v12 = _mm256_add_epi16(u12, u13); + v13 = _mm256_sub_epi16(u12, u13); + v14 = _mm256_sub_epi16(u15, u14); + v15 = _mm256_add_epi16(u15, u14); + + { + const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&v9, &v14, &stg4_4, &stg4_5); + butterfly_self(&v10, &v13, &stg4_6, &stg4_4); + } + + out[0] = _mm256_add_epi16(v8, v11); + out[1] = _mm256_add_epi16(v9, v10); + out[2] = _mm256_sub_epi16(v9, v10); + out[3] = _mm256_sub_epi16(v8, v11); + out[4] = _mm256_sub_epi16(v15, v12); + out[5] = _mm256_sub_epi16(v14, v13); + out[6] = _mm256_add_epi16(v14, v13); + out[7] = _mm256_add_epi16(v15, v12); + + { + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); + butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); + } +} + +// 8x32 block even indexed 8 inputs of in[16], +// output first half 16 to out[32] +static void idct32_16x32_quarter_1_2(const __m256i *in /*in[16]*/, + __m256i *out /*out[32]*/) { + __m256i temp[16]; + idct32_16x32_135_quarter_1(in, temp); + idct32_16x32_135_quarter_2(in, &temp[8]); + add_sub_butterfly(temp, out, 16); +} + +// 8x32 block odd indexed 8 inputs of in[16], +// output second half 16 to out[32] +static void idct32_16x32_quarter_3_4(const __m256i *in /*in[16]*/, + __m256i *out /*out[32]*/) { + __m256i v16, v17, v18, v19, v20, v21, v22, v23; + __m256i v24, v25, v26, v27, v28, v29, v30, v31; + __m256i u16, u17, u18, u19, u20, u21, u22, u23; + __m256i u24, u25, u26, u27, u28, u29, u30, u31; + + { + const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m256i stk1_2 = + pair256_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); + const __m256i stk1_3 = pair256_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); + + const __m256i stk1_4 = pair256_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); + const __m256i stk1_5 = pair256_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); + const __m256i stk1_6 = + pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m256i stk1_10 = + pair256_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); + const __m256i stk1_11 = pair256_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); + + const __m256i stk1_12 = pair256_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); + const __m256i stk1_13 = pair256_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); + const __m256i stk1_14 = + pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + u16 = _mm256_mulhrs_epi16(in[1], stk1_0); + u31 = _mm256_mulhrs_epi16(in[1], stk1_1); + u17 = _mm256_mulhrs_epi16(in[15], stk1_2); + u30 = _mm256_mulhrs_epi16(in[15], stk1_3); + + u18 = _mm256_mulhrs_epi16(in[9], stk1_4); + u29 = _mm256_mulhrs_epi16(in[9], stk1_5); + u19 = _mm256_mulhrs_epi16(in[7], stk1_6); + u28 = _mm256_mulhrs_epi16(in[7], stk1_7); + + u20 = _mm256_mulhrs_epi16(in[5], stk1_8); + u27 = _mm256_mulhrs_epi16(in[5], stk1_9); + u21 = _mm256_mulhrs_epi16(in[11], stk1_10); + u26 = _mm256_mulhrs_epi16(in[11], stk1_11); + + u22 = _mm256_mulhrs_epi16(in[13], stk1_12); + u25 = _mm256_mulhrs_epi16(in[13], stk1_13); + u23 = _mm256_mulhrs_epi16(in[3], stk1_14); + u24 = _mm256_mulhrs_epi16(in[3], stk1_15); + } + + v16 = _mm256_add_epi16(u16, u17); + v17 = _mm256_sub_epi16(u16, u17); + v18 = _mm256_sub_epi16(u19, u18); + v19 = _mm256_add_epi16(u19, u18); + + v20 = _mm256_add_epi16(u20, u21); + v21 = _mm256_sub_epi16(u20, u21); + v22 = _mm256_sub_epi16(u23, u22); + v23 = _mm256_add_epi16(u23, u22); + + v24 = _mm256_add_epi16(u24, u25); + v25 = _mm256_sub_epi16(u24, u25); + v26 = _mm256_sub_epi16(u27, u26); + v27 = _mm256_add_epi16(u27, u26); + + v28 = _mm256_add_epi16(u28, u29); + v29 = _mm256_sub_epi16(u28, u29); + v30 = _mm256_sub_epi16(u31, u30); + v31 = _mm256_add_epi16(u31, u30); + + { + const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); + const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); + const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); + + butterfly_self(&v17, &v30, &stg3_4, &stg3_5); + butterfly_self(&v18, &v29, &stg3_6, &stg3_4); + butterfly_self(&v21, &v26, &stg3_8, &stg3_9); + butterfly_self(&v22, &v25, &stg3_10, &stg3_8); + } + + u16 = _mm256_add_epi16(v16, v19); + u17 = _mm256_add_epi16(v17, v18); + u18 = _mm256_sub_epi16(v17, v18); + u19 = _mm256_sub_epi16(v16, v19); + u20 = _mm256_sub_epi16(v23, v20); + u21 = _mm256_sub_epi16(v22, v21); + u22 = _mm256_add_epi16(v22, v21); + u23 = _mm256_add_epi16(v23, v20); + + u24 = _mm256_add_epi16(v24, v27); + u25 = _mm256_add_epi16(v25, v26); + u26 = _mm256_sub_epi16(v25, v26); + u27 = _mm256_sub_epi16(v24, v27); + u28 = _mm256_sub_epi16(v31, v28); + u29 = _mm256_sub_epi16(v30, v29); + u30 = _mm256_add_epi16(v29, v30); + u31 = _mm256_add_epi16(v28, v31); + + { + const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + } + + out[0] = _mm256_add_epi16(u16, u23); + out[1] = _mm256_add_epi16(u17, u22); + out[2] = _mm256_add_epi16(u18, u21); + out[3] = _mm256_add_epi16(u19, u20); + v20 = _mm256_sub_epi16(u19, u20); + v21 = _mm256_sub_epi16(u18, u21); + v22 = _mm256_sub_epi16(u17, u22); + v23 = _mm256_sub_epi16(u16, u23); + + v24 = _mm256_sub_epi16(u31, u24); + v25 = _mm256_sub_epi16(u30, u25); + v26 = _mm256_sub_epi16(u29, u26); + v27 = _mm256_sub_epi16(u28, u27); + out[12] = _mm256_add_epi16(u27, u28); + out[13] = _mm256_add_epi16(u26, u29); + out[14] = _mm256_add_epi16(u25, u30); + out[15] = _mm256_add_epi16(u24, u31); + + { + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); + butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]); + butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); + butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); + } +} + +// 16x16 block input __m256i in[32], output 16x32 __m256i in[32] +static void idct32_16x32_135(__m256i *in /*in[32]*/) { + __m256i out[32]; + idct32_16x32_quarter_1_2(in, out); + idct32_16x32_quarter_3_4(in, &out[16]); + add_sub_butterfly(out, in, 32); +} + +static INLINE void load_buffer_from_32x32(const tran_low_t *coeff, __m256i *in, + int size) { + int i = 0; + while (i < size) { + load_coeff(coeff + (i << 5), &in[i]); + i += 1; + } +} + +static INLINE void zero_buffer(__m256i *in, int num) { + int i; + for (i = 0; i < num; ++i) { + in[i] = _mm256_setzero_si256(); + } +} + +// Only upper-left 16x16 has non-zero coeff +void aom_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32]; + zero_buffer(in, 32); + load_buffer_from_32x32(input, in, 16); + mm256_transpose_16x16(in, in); + idct32_16x32_135(in); + + __m256i out[32]; + mm256_transpose_16x16(in, out); + idct32_16x32_135(out); + store_buffer_16xN(out, stride, dest, 32); + mm256_transpose_16x16(&in[16], in); + idct32_16x32_135(in); + store_buffer_16xN(in, stride, dest + 16, 32); +} + +static void idct32_34_first_half(const __m256i *in, __m256i *stp1) { + const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m256i stk2_6 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + + const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); + const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1, x4, x5, x6, x7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; + + // phase 1 + + // 0, 15 + u2 = _mm256_mulhrs_epi16(in[2], stk2_1); // stp2_15 + u3 = _mm256_mulhrs_epi16(in[6], stk2_7); // stp2_12 + v15 = _mm256_add_epi16(u2, u3); + // in[0], in[4] + x0 = _mm256_mulhrs_epi16(in[0], stk4_0); // stp1[0] + x7 = _mm256_mulhrs_epi16(in[4], stk3_1); // stp1[7] + v0 = _mm256_add_epi16(x0, x7); // stp2_0 + stp1[0] = _mm256_add_epi16(v0, v15); + stp1[15] = _mm256_sub_epi16(v0, v15); + + // in[2], in[6] + u0 = _mm256_mulhrs_epi16(in[2], stk2_0); // stp2_8 + u1 = _mm256_mulhrs_epi16(in[6], stk2_6); // stp2_11 + butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 + butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 + + v8 = _mm256_add_epi16(u0, u1); + v9 = _mm256_add_epi16(u4, u6); + v10 = _mm256_sub_epi16(u4, u6); + v11 = _mm256_sub_epi16(u0, u1); + v12 = _mm256_sub_epi16(u2, u3); + v13 = _mm256_sub_epi16(u5, u7); + v14 = _mm256_add_epi16(u5, u7); + + butterfly_self(&v10, &v13, &stg6_0, &stg4_0); + butterfly_self(&v11, &v12, &stg6_0, &stg4_0); + + // 1, 14 + x1 = _mm256_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 + // stp1[2] = stp1[0], stp1[3] = stp1[1] + x4 = _mm256_mulhrs_epi16(in[4], stk3_0); // stp1[4] + butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); + v1 = _mm256_add_epi16(x1, x6); // stp2_1 + v2 = _mm256_add_epi16(x0, x5); // stp2_2 + stp1[1] = _mm256_add_epi16(v1, v14); + stp1[14] = _mm256_sub_epi16(v1, v14); + + stp1[2] = _mm256_add_epi16(v2, v13); + stp1[13] = _mm256_sub_epi16(v2, v13); + + v3 = _mm256_add_epi16(x1, x4); // stp2_3 + v4 = _mm256_sub_epi16(x1, x4); // stp2_4 + + v5 = _mm256_sub_epi16(x0, x5); // stp2_5 + + v6 = _mm256_sub_epi16(x1, x6); // stp2_6 + v7 = _mm256_sub_epi16(x0, x7); // stp2_7 + stp1[3] = _mm256_add_epi16(v3, v12); + stp1[12] = _mm256_sub_epi16(v3, v12); + + stp1[6] = _mm256_add_epi16(v6, v9); + stp1[9] = _mm256_sub_epi16(v6, v9); + + stp1[7] = _mm256_add_epi16(v7, v8); + stp1[8] = _mm256_sub_epi16(v7, v8); + + stp1[4] = _mm256_add_epi16(v4, v11); + stp1[11] = _mm256_sub_epi16(v4, v11); + + stp1[5] = _mm256_add_epi16(v5, v10); + stp1[10] = _mm256_sub_epi16(v5, v10); +} + +static void idct32_34_second_half(const __m256i *in, __m256i *stp1) { + const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m256i stk1_6 = pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m256i stk1_14 = pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); + const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); + const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); + const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); + const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); + const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); + const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); + const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); + const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); + __m256i v16, v17, v18, v19, v20, v21, v22, v23; + __m256i v24, v25, v26, v27, v28, v29, v30, v31; + __m256i u16, u17, u18, u19, u20, u21, u22, u23; + __m256i u24, u25, u26, u27, u28, u29, u30, u31; + + v16 = _mm256_mulhrs_epi16(in[1], stk1_0); + v31 = _mm256_mulhrs_epi16(in[1], stk1_1); + + v19 = _mm256_mulhrs_epi16(in[7], stk1_6); + v28 = _mm256_mulhrs_epi16(in[7], stk1_7); + + v20 = _mm256_mulhrs_epi16(in[5], stk1_8); + v27 = _mm256_mulhrs_epi16(in[5], stk1_9); + + v23 = _mm256_mulhrs_epi16(in[3], stk1_14); + v24 = _mm256_mulhrs_epi16(in[3], stk1_15); + + butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30); + butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); + butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); + butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); + + u16 = _mm256_add_epi16(v16, v19); + u17 = _mm256_add_epi16(v17, v18); + u18 = _mm256_sub_epi16(v17, v18); + u19 = _mm256_sub_epi16(v16, v19); + u20 = _mm256_sub_epi16(v23, v20); + u21 = _mm256_sub_epi16(v22, v21); + u22 = _mm256_add_epi16(v22, v21); + u23 = _mm256_add_epi16(v23, v20); + u24 = _mm256_add_epi16(v24, v27); + u27 = _mm256_sub_epi16(v24, v27); + u25 = _mm256_add_epi16(v25, v26); + u26 = _mm256_sub_epi16(v25, v26); + u28 = _mm256_sub_epi16(v31, v28); + u31 = _mm256_add_epi16(v28, v31); + u29 = _mm256_sub_epi16(v30, v29); + u30 = _mm256_add_epi16(v29, v30); + + butterfly_self(&u18, &u29, &stg4_4, &stg4_5); + butterfly_self(&u19, &u28, &stg4_4, &stg4_5); + butterfly_self(&u20, &u27, &stg4_6, &stg4_4); + butterfly_self(&u21, &u26, &stg4_6, &stg4_4); + + stp1[0] = _mm256_add_epi16(u16, u23); + stp1[7] = _mm256_sub_epi16(u16, u23); + + stp1[1] = _mm256_add_epi16(u17, u22); + stp1[6] = _mm256_sub_epi16(u17, u22); + + stp1[2] = _mm256_add_epi16(u18, u21); + stp1[5] = _mm256_sub_epi16(u18, u21); + + stp1[3] = _mm256_add_epi16(u19, u20); + stp1[4] = _mm256_sub_epi16(u19, u20); + + stp1[8] = _mm256_sub_epi16(u31, u24); + stp1[15] = _mm256_add_epi16(u24, u31); + + stp1[9] = _mm256_sub_epi16(u30, u25); + stp1[14] = _mm256_add_epi16(u25, u30); + + stp1[10] = _mm256_sub_epi16(u29, u26); + stp1[13] = _mm256_add_epi16(u26, u29); + + stp1[11] = _mm256_sub_epi16(u28, u27); + stp1[12] = _mm256_add_epi16(u27, u28); + + butterfly_self(&stp1[4], &stp1[11], &stg6_0, &stg4_0); + butterfly_self(&stp1[5], &stp1[10], &stg6_0, &stg4_0); + butterfly_self(&stp1[6], &stp1[9], &stg6_0, &stg4_0); + butterfly_self(&stp1[7], &stp1[8], &stg6_0, &stg4_0); +} + +// 16x16 block input __m256i in[32], output 16x32 __m256i in[32] +static void idct32_16x32_34(__m256i *in /*in[32]*/) { + __m256i out[32]; + idct32_34_first_half(in, out); + idct32_34_second_half(in, &out[16]); + add_sub_butterfly(out, in, 32); +} + +// Only upper-left 8x8 has non-zero coeff +void aom_idct32x32_34_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32]; + zero_buffer(in, 32); + load_buffer_from_32x32(input, in, 8); + mm256_transpose_16x16(in, in); + idct32_16x32_34(in); + + __m256i out[32]; + mm256_transpose_16x16(in, out); + idct32_16x32_34(out); + store_buffer_16xN(out, stride, dest, 32); + mm256_transpose_16x16(&in[16], in); + idct32_16x32_34(in); + store_buffer_16xN(in, stride, dest + 16, 32); +} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h new file mode 100644 index 000000000..4238e651b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H +#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H + +#include + +#include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { +#if CONFIG_HIGHBITDEPTH + *in = _mm256_setr_epi16( + (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], + (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], + (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], + (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], + (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], + (int16_t)coeff[15]); +#else + *in = _mm256_loadu_si256((const __m256i *)coeff); +#endif +} + +static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { + int i = 0; + while (i < 16) { + load_coeff(coeff + (i << 4), &in[i]); + i += 1; + } +} + +static INLINE void recon_and_store(const __m256i *res, uint8_t *output) { + const __m128i zero = _mm_setzero_si128(); + __m128i x = _mm_loadu_si128((__m128i const *)output); + __m128i p0 = _mm_unpacklo_epi8(x, zero); + __m128i p1 = _mm_unpackhi_epi8(x, zero); + + p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res)); + p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1)); + x = _mm_packus_epi16(p0, p1); + _mm_storeu_si128((__m128i *)output, x); +} + +#define IDCT_ROUNDING_POS (6) +static INLINE void store_buffer_16xN(__m256i *in, const int stride, + uint8_t *output, int num) { + const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1)); + int i = 0; + + while (i < num) { + in[i] = _mm256_adds_epi16(in[i], rounding); + in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS); + recon_and_store(&in[i], output + i * stride); + i += 1; + } +} + +static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1, + const __m256i *c0, const __m256i *c1, + __m256i *b0, __m256i *b1) { + __m256i x0, x1; + x0 = _mm256_unpacklo_epi16(*a0, *a1); + x1 = _mm256_unpackhi_epi16(*a0, *a1); + *b0 = butter_fly(&x0, &x1, c0); + *b1 = butter_fly(&x0, &x1, c1); +} + +void av1_idct16_avx2(__m256i *in); + +#endif // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c index 5795a1845..be200df4c 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c @@ -3628,4 +3628,107 @@ void aom_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, } } +void aom_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + int i, j, test; + __m128i inptr[8]; + __m128i min_input, max_input, temp1, temp2, sign_bits; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + const __m128i zero = _mm_set1_epi16(0); + const __m128i sixteen = _mm_set1_epi16(16); + const __m128i max = _mm_set1_epi16(6201); + const __m128i min = _mm_set1_epi16(-6201); + int optimised_cols = 0; + + // Load input into __m128i & pack to 16 bits + for (i = 0; i < 8; i++) { + temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); + temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); + inptr[i] = _mm_packs_epi32(temp1, temp2); + } + + // Find the min & max for the row transform + // only first 4 row has non-zero coefs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 4; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (!test) { + // Do the row transform + aom_idct8_sse2(inptr); + + // Find the min & max for the column transform + // N.B. Only first 4 cols contain non-zero coeffs + max_input = _mm_max_epi16(inptr[0], inptr[1]); + min_input = _mm_min_epi16(inptr[0], inptr[1]); + for (i = 2; i < 8; i++) { + max_input = _mm_max_epi16(max_input, inptr[i]); + min_input = _mm_min_epi16(min_input, inptr[i]); + } + max_input = _mm_cmpgt_epi16(max_input, max); + min_input = _mm_cmplt_epi16(min_input, min); + temp1 = _mm_or_si128(max_input, min_input); + test = _mm_movemask_epi8(temp1); + + if (test) { + // Use fact only first 4 rows contain non-zero coeffs + array_transpose_4X8(inptr, inptr); + for (i = 0; i < 4; i++) { + sign_bits = _mm_cmplt_epi16(inptr[i], zero); + temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); + temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); + _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); + } + } else { + // Set to use the optimised transform for the column + optimised_cols = 1; + } + } else { + // Run the un-optimised row transform + for (i = 0; i < 4; ++i) { + aom_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + } + + if (optimised_cols) { + aom_idct8_sse2(inptr); + + // Final round & shift and Reconstruction and Store + { + __m128i d[8]; + for (i = 0; i < 8; i++) { + inptr[i] = _mm_add_epi16(inptr[i], sixteen); + d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); + inptr[i] = _mm_srai_epi16(inptr[i], 5); + d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); + // Store + _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); + } + } + } else { + // Run the un-optimised column transform + tran_low_t temp_in[8], temp_out[8]; + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + aom_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } + } +} + #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 5166e9e0a..9d16a3e84 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -9,49 +9,70 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include -#include +#include #include -#include "aom_ports/mem.h" #include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/blend.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" -static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { - __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); - __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); - return _mm_unpacklo_epi64(temp1, temp2); -} - -static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { - __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr); - __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride)); - __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); - temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2)); - temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3)); - temp1 = _mm_unpacklo_epi32(temp1, temp2); - return _mm_unpacklo_epi64(temp3, temp1); -} - -static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, +// For width a multiple of 16 +static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, + int src_stride, + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height); static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height); + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height); static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height); - -#define MASKSADMXN_SSSE3(m, n) \ - unsigned int aom_masked_sad##m##x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \ - m, n); \ + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height); + +#define MASKSADMXN_SSSE3(m, n) \ + unsigned int aom_masked_sad##m##x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + m, msk, msk_stride, m, n); \ + else \ + return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + unsigned int aom_masked_sad8x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 8, msk, msk_stride, n); \ + else \ + return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ + ref_stride, msk, msk_stride, n); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + unsigned int aom_masked_sad4x##n##_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 4, msk, msk_stride, n); \ + else \ + return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ + ref_stride, msk, msk_stride, n); \ } #if CONFIG_EXT_PARTITION @@ -67,165 +88,181 @@ MASKSADMXN_SSSE3(32, 16) MASKSADMXN_SSSE3(16, 32) MASKSADMXN_SSSE3(16, 16) MASKSADMXN_SSSE3(16, 8) - -#define MASKSAD8XN_SSSE3(n) \ - unsigned int aom_masked_sad8x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ - } - MASKSAD8XN_SSSE3(16) MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) - -#define MASKSAD4XN_SSSE3(n) \ - unsigned int aom_masked_sad4x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ - } - MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) -// For width a multiple of 16 -// Assumes values in m are <=64 -static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, +static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, + int src_stride, + const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height) { - int y, x; - __m128i a, b, m, temp1, temp2; + int x, y; __m128i res = _mm_setzero_si128(); - __m128i one = _mm_set1_epi16(1); - // For each row + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y++) { - // Covering the full width for (x = 0; x < width; x += 16) { - // Load a, b, m in xmm registers - a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); - b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); - m = _mm_loadu_si128((const __m128i *)(m_ptr + x)); - - // Calculate the difference between a & b - temp1 = _mm_subs_epu8(a, b); - temp2 = _mm_subs_epu8(b, a); - temp1 = _mm_or_si128(temp1, temp2); - - // Multiply by m and add together - temp2 = _mm_maddubs_epi16(temp1, m); - // Pad out row result to 32 bit integers & add to running total - res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one)); + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m128i data_l = _mm_unpacklo_epi8(a, b); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(a, b); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); } - // Move onto the next row + + src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - // sad = (sad + 31) >> 6; - return (_mm_cvtsi128_si32(res) + 31) >> 6; + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return (sad + 31) >> 6; } static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height) { + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { int y; - __m128i a, b, m, temp1, temp2, row_res; __m128i res = _mm_setzero_si128(); - __m128i one = _mm_set1_epi16(1); - // Add the masked SAD for 2 rows at a time - for (y = 0; y < height; y += 2) { - // Load a, b, m in xmm registers - a = width8_load_2rows(a_ptr, a_stride); - b = width8_load_2rows(b_ptr, b_stride); - m = width8_load_2rows(m_ptr, m_stride); - - // Calculate the difference between a & b - temp1 = _mm_subs_epu8(a, b); - temp2 = _mm_subs_epu8(b, a); - temp1 = _mm_or_si128(temp1, temp2); - - // Multiply by m and add together - row_res = _mm_maddubs_epi16(temp1, m); - - // Pad out row result to 32 bit integers & add to running total - res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - // Move onto the next rows + for (y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr); + const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]); + const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi8(a0, b0); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpacklo_epi8(a1, b1); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_l, pred_r); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - // sad = (sad + 31) >> 6; - return (_mm_cvtsi128_si32(res) + 31) >> 6; + int32_t sad = + _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); + return (sad + 31) >> 6; } static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height) { + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { int y; - __m128i a, b, m, temp1, temp2, row_res; __m128i res = _mm_setzero_si128(); - __m128i one = _mm_set1_epi16(1); - // Add the masked SAD for 4 rows at a time - for (y = 0; y < height; y += 4) { - // Load a, b, m in xmm registers - a = width4_load_4rows(a_ptr, a_stride); - b = width4_load_4rows(b_ptr, b_stride); - m = width4_load_4rows(m_ptr, m_stride); - - // Calculate the difference between a & b - temp1 = _mm_subs_epu8(a, b); - temp2 = _mm_subs_epu8(b, a); - temp1 = _mm_or_si128(temp1, temp2); - - // Multiply by m and add together - row_res = _mm_maddubs_epi16(temp1, m); - - // Pad out row result to 32 bit integers & add to running total - res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); - - // Move onto the next rows - a_ptr += a_stride * 4; - b_ptr += b_stride * 4; - m_ptr += m_stride * 4; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (y = 0; y < height; y += 2) { + // Load two rows at a time, this seems to be a bit faster + // than four rows at a time in this case. + const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + const __m128i data = _mm_unpacklo_epi8(a, b); + const __m128i mask = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_16bit = _mm_maddubs_epi16(data, mask); + pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128()); + res = _mm_add_epi32(res, _mm_sad_epu8(pred, src)); + + src_ptr += src_stride * 2; + a_ptr += a_stride * 2; + b_ptr += b_stride * 2; + m_ptr += m_stride * 2; } - // Pad out row result to 32 bit integers & add to running total - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - // sad = (sad + 31) >> 6; - return (_mm_cvtsi128_si32(res) + 31) >> 6; + // At this point, the SAD is stored in lane 0 of 'res' + int32_t sad = _mm_cvtsi128_si32(res); + return (sad + 31) >> 6; } #if CONFIG_HIGHBITDEPTH -static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, - int stride) { - __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); - __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); - return _mm_unpacklo_epi64(temp1, temp2); -} - +// For width a multiple of 8 static INLINE unsigned int highbd_masked_sad_ssse3( - const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int width, int height); + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height); static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height); + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height); #define HIGHBD_MASKSADMXN_SSSE3(m, n) \ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, m, n); \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +#define HIGHBD_MASKSAD4XN_SSSE3(n) \ + unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride, \ + second_pred8, 4, msk, msk_stride, n); \ + else \ + return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, n); \ } #if CONFIG_EXT_PARTITION @@ -244,91 +281,124 @@ HIGHBD_MASKSADMXN_SSSE3(16, 8) HIGHBD_MASKSADMXN_SSSE3(8, 16) HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) - -#define HIGHBD_MASKSAD4XN_SSSE3(n) \ - unsigned int aom_highbd_masked_sad4x##n##_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *msk, int msk_stride) { \ - return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \ - msk_stride, n); \ - } - HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) -// For width a multiple of 8 -// Assumes values in m are <=64 static INLINE unsigned int highbd_masked_sad_ssse3( - const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int width, int height) { - int y, x; - __m128i a, b, m, temp1, temp2; - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; __m128i res = _mm_setzero_si128(); - // For each row + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + for (y = 0; y < height; y++) { - // Covering the full width for (x = 0; x < width; x += 8) { - // Load a, b, m in xmm registers - a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); - b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); - m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)), - _mm_setzero_si128()); - - // Calculate the difference between a & b - temp1 = _mm_subs_epu16(a, b); - temp2 = _mm_subs_epu16(b, a); - temp1 = _mm_or_si128(temp1, temp2); - - // Add result of multiplying by m and add pairs together to running total - res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); } - // Move onto the next row + + src_ptr += src_stride; a_ptr += a_stride; b_ptr += b_stride; m_ptr += m_stride; } - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - // sad = (sad + 31) >> 6; - return (_mm_cvtsi128_si32(res) + 31) >> 6; + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return (sad + 31) >> 6; } static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height) { + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int y; - __m128i a, b, m, temp1, temp2; - const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); - const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); __m128i res = _mm_setzero_si128(); - // Add the masked SAD for 2 rows at a time + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i one = _mm_set1_epi16(1); + for (y = 0; y < height; y += 2) { - // Load a, b, m in xmm registers - a = highbd_width4_load_2rows(a_ptr, a_stride); - b = highbd_width4_load_2rows(b_ptr, b_stride); - temp1 = _mm_loadl_epi64((const __m128i *)m_ptr); - temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)); - m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), - _mm_setzero_si128()); - - // Calculate the difference between a & b - temp1 = _mm_subs_epu16(a, b); - temp2 = _mm_subs_epu16(b, a); - temp1 = _mm_or_si128(temp1, temp2); - - // Multiply by m and add together - res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); - - // Move onto the next rows + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), + _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); + const __m128i b = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), + _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); + // Zero-extend mask to 16 bits + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + _mm_setzero_si128()); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i pred = _mm_packs_epi32(pred_l, pred_r); + const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); + res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); + + src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += b_stride * 2; m_ptr += m_stride * 2; } - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - res = _mm_hadd_epi32(res, _mm_setzero_si128()); - // sad = (sad + 31) >> 6; - return (_mm_cvtsi128_si32(res) + 31) >> 6; + res = _mm_hadd_epi32(res, res); + res = _mm_hadd_epi32(res, res); + int sad = _mm_cvtsi128_si32(res); + return (sad + 31) >> 6; } -#endif // CONFIG_HIGHBITDEPTH + +#endif diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c index fe14597f6..be9d437d2 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2017, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -9,1940 +9,1003 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include #include -#include +#include #include #include "./aom_config.h" +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_filter.h" - -// Half pixel shift -#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2) - -/***************************************************************************** - * Horizontal additions - *****************************************************************************/ - -static INLINE int32_t hsum_epi32_si32(__m128i v_d) { - v_d = _mm_hadd_epi32(v_d, v_d); - v_d = _mm_hadd_epi32(v_d, v_d); - return _mm_cvtsi128_si32(v_d); -} - -static INLINE int64_t hsum_epi64_si64(__m128i v_q) { - v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); -#if ARCH_X86_64 - return _mm_cvtsi128_si64(v_q); -#else - { - int64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_q); - return tmp; - } -#endif -} - -#if CONFIG_HIGHBITDEPTH -static INLINE int64_t hsum_epi32_si64(__m128i v_d) { - const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); - const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); - const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); - return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); -} -#endif // CONFIG_HIGHBITDEPTH - -static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q, - uint32_t *sse, int w, int h) { - int64_t sum64; - uint64_t sse64; - - // Horizontal sum - sum64 = hsum_epi32_si32(v_sum_d); - sse64 = hsum_epi64_si64(v_sse_q); - - sum64 = (sum64 >= 0) ? sum64 : -sum64; - - // Round - sum64 = ROUND_POWER_OF_TWO(sum64, 6); - sse64 = ROUND_POWER_OF_TWO(sse64, 12); - - // Store the SSE - *sse = (uint32_t)sse64; - // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); -} - -/***************************************************************************** - * n*16 Wide versions - *****************************************************************************/ - -static INLINE unsigned int masked_variancewxh_ssse3( - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { - int ii, jj; - - const __m128i v_zero = _mm_setzero_si128(); - - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - - assert((w % 16) == 0); - - for (ii = 0; ii < h; ii++) { - for (jj = 0; jj < w; jj += 16) { - // Load inputs - 8 bits - const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj)); - const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj)); - const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj)); - - // Unpack to 16 bits - still containing max 8 bits - const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); - const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero); - const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero); - const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero); - const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero); - const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero); - - // Difference: [-255, 255] - const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w); - const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w); - - // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits - const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w); - const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w); - const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w); - const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w); - - // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits - const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w); - const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w); - - // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits - const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d); - - // Unpack Squared error to 64 bits - const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); - const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); - - // Accumulate - v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d); - v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); - } - - // Move on to next row - a += a_stride; - b += b_stride; - m += m_stride; - } - - return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); -} - -#define MASKED_VARWXH(W, H) \ - unsigned int aom_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \ - H, sse); \ +#include "aom_dsp/x86/synonyms.h" + +// For width a multiple of 16 +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h); + +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h); + +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h); + +// For width a multiple of 16 +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_); + +static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_); + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_); + +#define MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * W]; \ + \ + bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, sse, &sum); \ + else \ + masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define MASK_SUBPIX_VAR8XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 8]; \ + \ + bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \ + } + +#define MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + int sum; \ + uint8_t temp[(H + 1) * 4]; \ + \ + bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \ + H, sse, &sum); \ + else \ + masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \ + H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ } -MASKED_VARWXH(16, 8) -MASKED_VARWXH(16, 16) -MASKED_VARWXH(16, 32) -MASKED_VARWXH(32, 16) -MASKED_VARWXH(32, 32) -MASKED_VARWXH(32, 64) -MASKED_VARWXH(64, 32) -MASKED_VARWXH(64, 64) #if CONFIG_EXT_PARTITION -MASKED_VARWXH(64, 128) -MASKED_VARWXH(128, 64) -MASKED_VARWXH(128, 128) -#endif // CONFIG_EXT_PARTITION - -/***************************************************************************** - * 8 Wide versions - *****************************************************************************/ - -static INLINE unsigned int masked_variance8xh_ssse3( - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, int h, unsigned int *sse) { - int ii; - - const __m128i v_zero = _mm_setzero_si128(); - - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - - for (ii = 0; ii < h; ii++) { - // Load inputs - 8 bits - const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a); - const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b); - const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m); - - // Unpack to 16 bits - still containing max 8 bits - const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); - const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero); - const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); - - // Difference: [-255, 255] - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - - // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits - const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w); - const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); - - // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits - const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w); - - // Unpack Squared error to 64 bits - const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); - const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); - - // Accumulate - v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); - - // Move on to next row - a += a_stride; - b += b_stride; - m += m_stride; - } - - return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); -} - -#define MASKED_VAR8XH(H) \ - unsigned int aom_masked_variance8x##H##_ssse3( \ - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ - sse); \ - } - -MASKED_VAR8XH(4) -MASKED_VAR8XH(8) -MASKED_VAR8XH(16) - -/***************************************************************************** - * 4 Wide versions - *****************************************************************************/ - -static INLINE unsigned int masked_variance4xh_ssse3( - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, - const uint8_t *m, int m_stride, int h, unsigned int *sse) { - int ii; - - const __m128i v_zero = _mm_setzero_si128(); - - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - - assert((h % 2) == 0); - - for (ii = 0; ii < h / 2; ii++) { - // Load 2 input rows - 8 bits - const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a); - const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b); - const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); - const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride)); - const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride)); - const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); - - // Interleave 2 rows into a single register - const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b); - const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b); - const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b); - - // Unpack to 16 bits - still containing max 8 bits - const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero); - const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero); - const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); - - // Difference: [-255, 255] - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - - // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits - const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w); - const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); - - // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits - const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w); - - // Unpack Squared error to 64 bits - const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); - const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); - - // Accumulate - v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q); - - // Move on to next 2 row - a += a_stride * 2; - b += b_stride * 2; - m += m_stride * 2; - } - - return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); -} - -#define MASKED_VAR4XH(H) \ - unsigned int aom_masked_variance4x##H##_ssse3( \ - const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \ - sse); \ - } - -MASKED_VAR4XH(4) -MASKED_VAR4XH(8) - -#if CONFIG_HIGHBITDEPTH - -// Main calculation for n*8 wide blocks -static INLINE void highbd_masked_variance64_ssse3( - const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) { - int ii, jj; - - const __m128i v_zero = _mm_setzero_si128(); - - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - - assert((w % 8) == 0); - - for (ii = 0; ii < h; ii++) { - for (jj = 0; jj < w; jj += 8) { - // Load inputs - 8 bits - const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj)); - const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj)); - const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj)); - - // Unpack m to 16 bits - still containing max 8 bits - const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); - - // Difference: [-4095, 4095] - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - - // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits - const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); - - // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) - const __m128i v_absd_w = _mm_abs_epi16(v_d_w); - const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); - const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); - const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); - const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); - const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); - const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); - // Square and sum the errors -> 36bits * 4 = 38bits - __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; - v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); - v_elo1_d = _mm_srli_si128(v_elo_d, 4); - v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); - v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); - v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); - v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); - v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); - v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); - v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); - - // Accumulate - v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_q); - } - - // Move on to next row - a += a_stride; - b += b_stride; - m += m_stride; - } - - // Horizontal sum - *sum = hsum_epi32_si64(v_sum_d); - *sse = hsum_epi64_si64(v_sse_q); - - // Round - *sum = (*sum >= 0) ? *sum : -*sum; - *sum = ROUND_POWER_OF_TWO(*sum, 6); - *sse = ROUND_POWER_OF_TWO(*sse, 12); -} - -// Main calculation for 4 wide blocks -static INLINE void highbd_masked_variance64_4wide_ssse3( - const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) { - int ii; - - const __m128i v_zero = _mm_setzero_si128(); - - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - - assert((h % 2) == 0); - - for (ii = 0; ii < h / 2; ii++) { - // Load 2 input rows - 8 bits - const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a); - const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b); - const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m); - const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride)); - const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride)); - const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride)); - - // Interleave 2 rows into a single register - const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w); - const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w); - const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b); - - // Unpack to 16 bits - still containing max 8 bits - const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); - - // Difference: [-4095, 4095] - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - - // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incld sign bit) - const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); - - // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) - const __m128i v_absd_w = _mm_abs_epi16(v_d_w); - const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); - const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); - const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); - const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); - const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); - const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); - // Square and sum the errors -> 36bits * 4 = 38bits - __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; - v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); - v_elo1_d = _mm_srli_si128(v_elo_d, 4); - v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); - v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); - v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); - v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); - v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); - v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); - v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); - - // Accumulate - v_sum_d = _mm_add_epi32(v_sum_d, v_e_d); - v_sse_q = _mm_add_epi64(v_sse_q, v_se_q); - - // Move on to next row - a += a_stride * 2; - b += b_stride * 2; - m += m_stride * 2; - } - - // Horizontal sum - *sum = hsum_epi32_si32(v_sum_d); - *sse = hsum_epi64_si64(v_sse_q); - - // Round - *sum = (*sum >= 0) ? *sum : -*sum; - *sum = ROUND_POWER_OF_TWO(*sum, 6); - *sse = ROUND_POWER_OF_TWO(*sse, 12); -} - -static INLINE unsigned int highbd_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { - uint64_t sse64; - int64_t sum64; - - if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); - else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); - - // Store the SSE - *sse = (uint32_t)sse64; - // Compute and return variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); -} - -static INLINE unsigned int highbd_10_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { - uint64_t sse64; - int64_t sum64; - - if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); - else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); - - // Normalise - sum64 = ROUND_POWER_OF_TWO(sum64, 2); - sse64 = ROUND_POWER_OF_TWO(sse64, 4); - - // Store the SSE - *sse = (uint32_t)sse64; - // Compute and return variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); -} - -static INLINE unsigned int highbd_12_masked_variancewxh_ssse3( - const uint16_t *a, int a_stride, const uint16_t *b, int b_stride, - const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) { - uint64_t sse64; - int64_t sum64; - - if (w == 4) - highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride, - h, &sum64, &sse64); - else - highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h, - &sum64, &sse64); - - sum64 = ROUND_POWER_OF_TWO(sum64, 4); - sse64 = ROUND_POWER_OF_TWO(sse64, 8); - - // Store the SSE - *sse = (uint32_t)sse64; - // Compute and return variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); -} - -#define HIGHBD_MASKED_VARWXH(W, H) \ - unsigned int aom_highbd_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ - m_stride, W, H, sse); \ - } \ - \ - unsigned int aom_highbd_10_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ - m_stride, W, H, sse); \ - } \ - \ - unsigned int aom_highbd_12_masked_variance##W##x##H##_ssse3( \ - const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \ - const uint8_t *m, int m_stride, unsigned int *sse) { \ - uint16_t *a = CONVERT_TO_SHORTPTR(a8); \ - uint16_t *b = CONVERT_TO_SHORTPTR(b8); \ - return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \ - m_stride, W, H, sse); \ - } - -HIGHBD_MASKED_VARWXH(4, 4) -HIGHBD_MASKED_VARWXH(4, 8) -HIGHBD_MASKED_VARWXH(8, 4) -HIGHBD_MASKED_VARWXH(8, 8) -HIGHBD_MASKED_VARWXH(8, 16) -HIGHBD_MASKED_VARWXH(16, 8) -HIGHBD_MASKED_VARWXH(16, 16) -HIGHBD_MASKED_VARWXH(16, 32) -HIGHBD_MASKED_VARWXH(32, 16) -HIGHBD_MASKED_VARWXH(32, 32) -HIGHBD_MASKED_VARWXH(32, 64) -HIGHBD_MASKED_VARWXH(64, 32) -HIGHBD_MASKED_VARWXH(64, 64) -#if CONFIG_EXT_PARTITION -HIGHBD_MASKED_VARWXH(64, 128) -HIGHBD_MASKED_VARWXH(128, 64) -HIGHBD_MASKED_VARWXH(128, 128) -#endif // CONFIG_EXT_PARTITION - +MASK_SUBPIX_VAR_SSSE3(128, 128) +MASK_SUBPIX_VAR_SSSE3(128, 64) +MASK_SUBPIX_VAR_SSSE3(64, 128) #endif - -////////////////////////////////////////////////////////////////////////////// -// Sub pixel versions -////////////////////////////////////////////////////////////////////////////// - -typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b, - __m128i v_filter_b); - -static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b, - const __m128i v_filter_b) { - (void)v_filter_b; - return _mm_avg_epu8(v_a_b, v_b_b); -} - -static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b, - const __m128i v_filter_b) { - const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); - __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b); - __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b); - __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b); - __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b); - __m128i v_res_lo_w = - _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); - __m128i v_res_hi_w = - _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS); - return _mm_packus_epi16(v_res_lo_w, v_res_hi_w); -} - -// Apply the filter to the contents of the lower half of a and b -static INLINE void apply_filter_lo(const __m128i v_a_lo_b, - const __m128i v_b_lo_b, - const __m128i v_filter_b, __m128i *v_res_w) { - const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1)); - __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b); - __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b); - *v_res_w = - _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS); -} - -static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b, - const __m128i v_m_b, __m128i *v_sum_d, - __m128i *v_sse_q) { - const __m128i v_zero = _mm_setzero_si128(); - // Unpack to 16 bits - still containing max 8 bits - const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero); - const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero); - const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero); - const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero); - const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero); - const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero); - - // Difference: [-255, 255] - const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w); - const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w); - - // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits - const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w); - const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w); - const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w); - const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w); - - // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits - const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w); - const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w); - - // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits - const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d); - - // Unpack Squared error to 64 bits - const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero); - const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero); - - // Accumulate - *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d); - *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d); - *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q); - *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q); -} - -// Functions for width (W) >= 16 -unsigned int aom_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride, - int yoffset, const uint8_t *dst, - int dst_stride, const uint8_t *msk, - int msk_stride, unsigned int *sse, - int w, int h, - filter_fn_t filter_fn) { +MASK_SUBPIX_VAR_SSSE3(64, 64) +MASK_SUBPIX_VAR_SSSE3(64, 32) +MASK_SUBPIX_VAR_SSSE3(32, 64) +MASK_SUBPIX_VAR_SSSE3(32, 32) +MASK_SUBPIX_VAR_SSSE3(32, 16) +MASK_SUBPIX_VAR_SSSE3(16, 32) +MASK_SUBPIX_VAR_SSSE3(16, 16) +MASK_SUBPIX_VAR_SSSE3(16, 8) +MASK_SUBPIX_VAR8XH_SSSE3(16) +MASK_SUBPIX_VAR8XH_SSSE3(8) +MASK_SUBPIX_VAR8XH_SSSE3(4) +MASK_SUBPIX_VAR4XH_SSSE3(8) +MASK_SUBPIX_VAR4XH_SSSE3(4) + +static INLINE __m128i filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi8(a, b); + v0 = _mm_maddubs_epi16(v0, filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi8(a, b); + v1 = _mm_maddubs_epi16(v1, filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); +} + +static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int w, int h) { int i, j; - __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_b = _mm_set1_epi16( - (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); - assert(yoffset < BIL_SUBPEL_SHIFTS); - for (j = 0; j < w; j += 16) { - // Load the first row ready - v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); - // Process 2 rows at a time - for (i = 0; i < h; i += 2) { - // Load the next row apply the filter - v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); - v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); - // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - - // Load the next row apply the filter - v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); - v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b); - // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); - v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride)); - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next block of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; } - // Reset to the top of the block - src -= src_stride * h; - dst -= dst_stride * h; - msk -= msk_stride * h; - } - return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); -} -unsigned int aom_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride, - int xoffset, const uint8_t *dst, - int dst_stride, const uint8_t *msk, - int msk_stride, unsigned int *sse, - int w, int h, - filter_fn_t filter_fn) { - int i, j; - __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_b = _mm_set1_epi16( - (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - for (i = 0; i < h; i++) { - for (j = 0; j < w; j += 16) { - // Load this row and one below & apply the filter to them - v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); - v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); - v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b); - - // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + __m128i z = _mm_alignr_epi8(y, x, 1); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z)); + } + src += src_stride; + b += w; } - src += src_stride; - dst += dst_stride; - msk += msk_stride; - } - return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); -} -unsigned int aom_masked_subpel_varWxH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, filter_fn_t xfilter_fn, - filter_fn_t yfilter_fn) { - int i, j; - __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b; - __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filterx_b = _mm_set1_epi16( - (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]); - const __m128i v_filtery_b = _mm_set1_epi16( - (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]); - assert(yoffset < BIL_SUBPEL_SHIFTS); - assert(xoffset < BIL_SUBPEL_SHIFTS); - for (j = 0; j < w; j += 16) { - // Load the first row ready - v_src0_b = _mm_loadu_si128((const __m128i *)(src + j)); - v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1)); - v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); - // Process 2 rows at a time - for (i = 0; i < h; i += 2) { - // Load the next row & apply the filter - v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); - v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); - v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b); - // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j)); - v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j)); - // Complete the calculation for this row and add it to the running total - v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b); - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - - // Load the next row & apply the filter - v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); - v_src1_b = - _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); - v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b); - // Load the dst and msk for the variance calculation - v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); - v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j)); - // Complete the calculation for this row and add it to the running total - v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b); - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next block of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]); + const __m128i z = _mm_alignr_epi8(y, x, 1); + const __m128i res = filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + } + + src += src_stride; + b += w; } - // Reset to the top of the block - src -= src_stride * h; - dst -= dst_stride * h; - msk -= msk_stride * h; } - return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h); -} -// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, -// xmm[63:32] = row 3, xmm[31:0] = row 4 -unsigned int aom_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride, - int yoffset, const uint8_t *dst, - int dst_stride, const uint8_t *msk, - int msk_stride, unsigned int *sse, - int h) { - int i; - __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w; - __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; - __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); - assert(yoffset < BIL_SUBPEL_SHIFTS); - // Load the first row of src data ready - v_src0_b = _mm_loadl_epi64((const __m128i *)src); - for (i = 0; i < h; i += 4) { - // Load the rest of the source data for these rows - v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); - v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); - v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); - v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); - v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); - v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); - v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); - // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); - v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); - v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); - v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); - // Apply the y filter - if (yoffset == HALF_PIXEL_OFFSET) { - v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b); - v_src2_b = - _mm_or_si128(_mm_slli_si128(v_src1_b, 4), - _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); - v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b); - } else { - v_src2_b = - _mm_or_si128(_mm_slli_si128(v_src1_b, 4), - _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0))); - apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w); - v_src2_b = - _mm_or_si128(_mm_slli_si128(v_src3_b, 4), - _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0))); - apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w); - v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w); + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y)); + } + dst += w; } - // Compute the sum and SSE - sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 4; - dst += dst_stride * 4; - msk += msk_stride * 4; - } - return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); -} - -// Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 -unsigned int aom_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride, - int yoffset, const uint8_t *dst, - int dst_stride, const uint8_t *msk, - int msk_stride, unsigned int *sse, - int h) { - int i; - __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b; - __m128i v_dst_b = _mm_setzero_si128(); - __m128i v_msk_b = _mm_setzero_si128(); - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); - assert(yoffset < BIL_SUBPEL_SHIFTS); - // Load the first row of src data ready - v_src0_b = _mm_loadl_epi64((const __m128i *)src); - for (i = 0; i < h; i += 2) { - if (yoffset == HALF_PIXEL_OFFSET) { - // Load the rest of the source data for these rows - v_src1_b = _mm_or_si128( - _mm_slli_si128(v_src0_b, 8), - _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); - v_src0_b = _mm_or_si128( - _mm_slli_si128(v_src1_b, 8), - _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); - // Apply the y filter - v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b); - } else { - // Load the data and apply the y filter - v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); - apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w); - v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w); - v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w); + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 16) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; } - // Load the dst data - v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); - // Compute the sum and SSE - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; } - return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); } -// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, -// xmm[63:32] = row 3, xmm[31:0] = row 4 -unsigned int aom_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride, - int xoffset, const uint8_t *dst, - int dst_stride, const uint8_t *msk, - int msk_stride, unsigned int *sse, - int h) { - int i; - __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; - __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; - __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b; - __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - for (i = 0; i < h; i += 4) { - // Load the src data - v_src0_b = _mm_loadl_epi64((const __m128i *)src); - v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); - v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); - v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); - v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); - v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); - v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); - v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); - // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); - v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); - v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); - v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); - // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); - v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); - v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); - v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b); - v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b); - v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b); - } else { - apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w); - apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w); - v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w); - } - // Compute the sum and SSE - sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 4; - dst += dst_stride * 4; - msk += msk_stride * 4; - } - return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); -} +static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0, + const __m128i a1, const __m128i b1, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi8(a0, b0); + v0 = _mm_maddubs_epi16(v0, filter); + v0 = xx_roundn_epu16(v0, FILTER_BITS); -unsigned int aom_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride, - int xoffset, const uint8_t *dst, - int dst_stride, const uint8_t *msk, - int msk_stride, unsigned int *sse, - int h) { - int i; - __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w; - __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - for (i = 0; i < h; i += 2) { - // Load the src data - v_src0_b = _mm_loadu_si128((const __m128i *)(src)); - v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); - v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); - v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); - v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); - } else { - apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w); - apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w); - v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); - } - // Load the dst data - v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); - // Compute the sum and SSE - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; - } - return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); + __m128i v1 = _mm_unpacklo_epi8(a1, b1); + v1 = _mm_maddubs_epi16(v1, filter); + v1 = xx_roundn_epu16(v1, FILTER_BITS); + + return _mm_packus_epi16(v0, v1); } -// Note order in which rows loaded xmm[127:96] = row 1, xmm[95:64] = row 2, -// xmm[63:32] = row 3, xmm[31:0] = row 4 -unsigned int aom_masked_subpel_var4xH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { int i; - __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w; - __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b; - __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b; - __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b; - __m128i v_xres_b[2]; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - assert(yoffset < BIL_SUBPEL_SHIFTS); - for (i = 0; i < h; i += 4) { - // Load the src data - v_src0_b = _mm_loadl_epi64((const __m128i *)src); - v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); - v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b); - v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b); - v_src2_shift_b = _mm_srli_si128(v_src2_b, 1); - v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b); - v_src3_shift_b = _mm_srli_si128(v_src3_b, 1); - v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b); - v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b); - v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b); - } else { - apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); - apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w); - v_xres_b[i == 0 ? 0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w); + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 8; + } + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 8; } - // Move onto the next set of rows - src += src_stride * 4; - } - // Load one more row to be used in the y filter - v_src0_b = _mm_loadl_epi64((const __m128i *)src); - v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b), - _mm_setr_epi32(-1, 0, 0, 0)); } else { - apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); - v_extra_row_b = - _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()), - _mm_setr_epi32(-1, 0, 0, 0)); - } + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); - for (i = 0; i < h; i += 4) { - if (h == 8 && i == 0) { - v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4), - _mm_srli_si128(v_xres_b[1], 12)); - } else { - v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4), - v_extra_row_b); + src += src_stride * 2; + b += 16; } - // Apply the y filter - if (yoffset == HALF_PIXEL_OFFSET) { - v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b); - } else { - v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b); + // Handle i = h separately + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + + __m128i v0 = _mm_unpacklo_epi8(x0, z0); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 8; } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); + const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); - // Load the dst data - v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0)); - v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1)); - v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b); - v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2)); - v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3)); - v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b); - v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b); - // Load the mask data - v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0)); - v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1)); - v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b); - v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2)); - v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3)); - v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b); - v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b); - // Compute the sum and SSE - sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - dst += dst_stride * 4; - msk += msk_stride * 4; + dst += 16; + } } - return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h); } -unsigned int aom_masked_subpel_var8xH_xnonzero_ynonzero( - const uint8_t *src, int src_stride, int xoffset, int yoffset, - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h) { +static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, + int yoffset, uint8_t *dst, int h) { int i; - __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b; - __m128i v_src0_shift_b, v_src1_shift_b; - __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) + - bilinear_filters_2t[yoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - assert(yoffset < BIL_SUBPEL_SHIFTS); - // Load the first block of src data - v_src0_b = _mm_loadu_si128((const __m128i *)(src)); - v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride)); - v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); - v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); - v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); - } else { - apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); - apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); - v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); - } - for (i = 0; i < h; i += 4) { - // Load the next block of src data - v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); - v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); - v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); - v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); - v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); - } else { - apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); - apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); - v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + // Horizontal filter + if (xoffset == 0) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = xx_loadl_32((__m128i *)src); + xx_storel_32((__m128i *)b, x); + src += src_stride; + b += 4; } - // Apply the y filter to the previous block - v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8), - _mm_slli_si128(v_xres1_b, 8)); - if (yoffset == HALF_PIXEL_OFFSET) { - v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b); - } else { - v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b); + } else if (xoffset == 4) { + uint8_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + __m128i z = _mm_srli_si128(x, 1); + xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z)); + src += src_stride; + b += 4; } - // Load the dst data - v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); - // Compute the sum and SSE - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - - // Load the next block of src data - v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); - v_src0_shift_b = _mm_srli_si128(v_src0_b, 1); - v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); - v_src1_shift_b = _mm_srli_si128(v_src1_b, 1); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b); - v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b); - v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b); - } else { - apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w); - apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w); - v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w); + } else { + uint8_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i x0 = _mm_loadl_epi64((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 1); + const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 1); + const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]); + const __m128i z2 = _mm_srli_si128(x2, 1); + const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]); + const __m128i z3 = _mm_srli_si128(x3, 1); + + const __m128i a0 = _mm_unpacklo_epi32(x0, x1); + const __m128i b0 = _mm_unpacklo_epi32(z0, z1); + const __m128i a1 = _mm_unpacklo_epi32(x2, x3); + const __m128i b1 = _mm_unpacklo_epi32(z2, z3); + const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 4; + b += 16; } - // Apply the y filter to the previous block - v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8), - _mm_slli_si128(v_xres0_b, 8)); - if (yoffset == HALF_PIXEL_OFFSET) { - v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b); - } else { - v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b); + // Handle i = h separately + const __m128i x = _mm_loadl_epi64((__m128i *)src); + const __m128i z = _mm_srli_si128(x, 1); + + __m128i v0 = _mm_unpacklo_epi8(x, z); + v0 = _mm_maddubs_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu16(v0, FILTER_BITS); + + xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = xx_loadl_32((__m128i *)dst); + __m128i y = xx_loadl_32((__m128i *)&dst[4]); + xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y)); + dst += 4; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8)); + for (i = 0; i < h; i += 4) { + const __m128i a = xx_loadl_32((__m128i *)dst); + const __m128i b = xx_loadl_32((__m128i *)&dst[4]); + const __m128i c = xx_loadl_32((__m128i *)&dst[8]); + const __m128i d = xx_loadl_32((__m128i *)&dst[12]); + const __m128i e = xx_loadl_32((__m128i *)&dst[16]); + + const __m128i a0 = _mm_unpacklo_epi32(a, b); + const __m128i b0 = _mm_unpacklo_epi32(b, c); + const __m128i a1 = _mm_unpacklo_epi32(c, d); + const __m128i b1 = _mm_unpacklo_epi32(d, e); + const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); + + dst += 16; } - // Load the dst data - v_dst_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); - // Compute the sum and SSE - sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 4; - dst += dst_stride * 4; - msk += msk_stride * 4; - } - return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h); -} - -// For W >=16 -#define MASK_SUBPIX_VAR_LARGE(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - assert(W % 16 == 0); \ - if (xoffset == 0) { \ - if (yoffset == 0) \ - return aom_masked_variance##W##x##H##_ssse3( \ - src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ - else if (yoffset == HALF_PIXEL_OFFSET) \ - return aom_masked_subpel_varWxH_xzero( \ - src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter_avg); \ - else \ - return aom_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \ - dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter); \ - } else if (yoffset == 0) { \ - if (xoffset == HALF_PIXEL_OFFSET) \ - return aom_masked_subpel_varWxH_yzero( \ - src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter_avg); \ - else \ - return aom_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \ - dst_stride, msk, msk_stride, \ - sse, W, H, apply_filter); \ - } else if (xoffset == HALF_PIXEL_OFFSET) { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ - dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \ - apply_filter_avg); \ - else \ - return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter_avg, apply_filter); \ - } else { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter, apply_filter_avg); \ - else \ - return aom_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, apply_filter, apply_filter); \ - } \ - } - -// For W < 16 -#define MASK_SUBPIX_VAR_SMALL(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - assert(W == 4 || W == 8); \ - if (xoffset == 0 && yoffset == 0) \ - return aom_masked_variance##W##x##H##_ssse3( \ - src, src_stride, dst, dst_stride, msk, msk_stride, sse); \ - else if (xoffset == 0) \ - return aom_masked_subpel_var##W##xH_xzero( \ - src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \ - else if (yoffset == 0) \ - return aom_masked_subpel_var##W##xH_yzero( \ - src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \ - else \ - return aom_masked_subpel_var##W##xH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ - sse, H); \ } - -MASK_SUBPIX_VAR_SMALL(4, 4) -MASK_SUBPIX_VAR_SMALL(4, 8) -MASK_SUBPIX_VAR_SMALL(8, 4) -MASK_SUBPIX_VAR_SMALL(8, 8) -MASK_SUBPIX_VAR_SMALL(8, 16) -MASK_SUBPIX_VAR_LARGE(16, 8) -MASK_SUBPIX_VAR_LARGE(16, 16) -MASK_SUBPIX_VAR_LARGE(16, 32) -MASK_SUBPIX_VAR_LARGE(32, 16) -MASK_SUBPIX_VAR_LARGE(32, 32) -MASK_SUBPIX_VAR_LARGE(32, 64) -MASK_SUBPIX_VAR_LARGE(64, 32) -MASK_SUBPIX_VAR_LARGE(64, 64) -#if CONFIG_EXT_PARTITION -MASK_SUBPIX_VAR_LARGE(64, 128) -MASK_SUBPIX_VAR_LARGE(128, 64) -MASK_SUBPIX_VAR_LARGE(128, 128) -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_HIGHBITDEPTH -typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q, - uint32_t *sse, int w, int h); -typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, - unsigned int *sse); -typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w, - __m128i v_filter_w); - -static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w, - const __m128i v_b_w, - const __m128i v_filter_w) { - (void)v_filter_w; - return _mm_avg_epu16(v_a_w, v_b_w); } -static INLINE __m128i highbd_apply_filter(const __m128i v_a_w, - const __m128i v_b_w, - const __m128i v_filter_w) { - const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w); - __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w); - __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w); - __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w); - __m128i v_res_lo_d = - _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); - __m128i v_res_hi_d = - _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS); - return _mm_packs_epi32(v_res_lo_d, v_res_hi_d); -} -// Apply the filter to the contents of the lower half of a and b -static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w, - const __m128i v_b_lo_w, - const __m128i v_filter_w, - __m128i *v_res_d) { - const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1)); - __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w); - __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w); - *v_res_d = - _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS); -} +static INLINE void accumulate_block(const __m128i src, const __m128i a, + const __m128i b, const __m128i m, + __m128i *sum, __m128i *sum_sq) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i m_inv = _mm_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m128i data_l = _mm_unpacklo_epi8(a, b); + const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi8(a, b); + const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi8(src, zero); + const __m128i src_r = _mm_unpackhi_epi8(src, zero); + const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); + const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); + + // Update partial sums and partial sums of squares + *sum = + _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one)); + *sum_sq = + _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l), + _mm_madd_epi16(diff_r, diff_r))); +} + +static void masked_variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int width, + int height, unsigned int *sse, int *sum_) { + int x, y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + accumulate_block(src, a, b, m, &sum, &sum_sq); + } -static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w, - const __m128i v_m_b, __m128i *v_sum_d, - __m128i *v_sse_q) { - const __m128i v_zero = _mm_setzero_si128(); - const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero); - - // Difference: [-2^12, 2^12] => 13 bits (incld sign bit) - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - - // Error - [-4095, 4095] * [0, 64] & sum pairs => fits in 19 + 1 bits - const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w); - - // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit) - const __m128i v_absd_w = _mm_abs_epi16(v_d_w); - const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero); - const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero); - const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d); - const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero); - const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero); - const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d); - // Square and sum the errors -> 36bits * 4 = 38bits - __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d; - v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d); - v_elo1_d = _mm_srli_si128(v_elo_d, 4); - v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d); - v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q); - v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d); - v_ehi3_d = _mm_srli_si128(v_ehi_d, 4); - v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d); - v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q); - v_se_q = _mm_add_epi64(v_se0_q, v_se1_q); - - // Accumulate - *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d); - *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q); + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), + _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); + accumulate_block(src, a, b, m, &sum, &sum_sq); + + src_ptr += src_stride * 2; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} + +static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, const uint8_t *b_ptr, + const uint8_t *m_ptr, int m_stride, int height, + unsigned int *sse, int *sum_) { + int y; + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + + for (y = 0; y < height; y += 4) { + // Load four rows at a time + __m128i src = + _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride], + *(uint32_t *)&src_ptr[src_stride * 2], + *(uint32_t *)&src_ptr[src_stride * 3]); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_setr_epi32( + *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride], + *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]); + accumulate_block(src, a, b, m, &sum, &sum_sq); + + src_ptr += src_stride * 4; + a_ptr += 16; + b_ptr += 16; + m_ptr += m_stride * 4; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, sum); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } -static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d, - __m128i v_sse_q, - uint32_t *sse, int w, - int h) { - int64_t sum64; - uint64_t sse64; - - // Horizontal sum - sum64 = hsum_epi32_si32(v_sum_d); - sse64 = hsum_epi64_si64(v_sse_q); - - sum64 = (sum64 >= 0) ? sum64 : -sum64; - - // Round - sum64 = ROUND_POWER_OF_TWO(sum64, 6); - sse64 = ROUND_POWER_OF_TWO(sse64, 12); - - // Normalise - sum64 = ROUND_POWER_OF_TWO(sum64, 2); - sse64 = ROUND_POWER_OF_TWO(sse64, 4); - - // Store the SSE - *sse = (uint32_t)sse64; - // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); -} -static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d, - __m128i v_sse_q, - uint32_t *sse, int w, - int h) { - int64_t sum64; - uint64_t sse64; - - // Horizontal sum - sum64 = hsum_epi32_si64(v_sum_d); - sse64 = hsum_epi64_si64(v_sse_q); - - sum64 = (sum64 >= 0) ? sum64 : -sum64; - - // Round - sum64 = ROUND_POWER_OF_TWO(sum64, 6); - sse64 = ROUND_POWER_OF_TWO(sse64, 12); - - // Normalise - sum64 = ROUND_POWER_OF_TWO(sum64, 4); - sse64 = ROUND_POWER_OF_TWO(sse64, 8); - - // Store the SSE - *sse = (uint32_t)sse64; - // Compute the variance - return *sse - (uint32_t)((sum64 * sum64) / (w * h)); -} +#if CONFIG_HIGHBITDEPTH +// For width a multiple of 8 +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h); + +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h); + +// For width a multiple of 8 +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_); + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_); + +#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)sse64; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + uint64_t sse64; \ + int sum; \ + uint16_t temp[(H + 1) * W]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + else \ + highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \ + msk_stride, W, H, &sse64, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } + +#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ + unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)sse_; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } \ + unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ + sum = ROUND_POWER_OF_TWO(sum, 2); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } \ + unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ + const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \ + const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ + int sse_; \ + int sum; \ + uint16_t temp[(H + 1) * 4]; \ + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \ + \ + highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \ + \ + if (!invert_mask) \ + highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \ + msk_stride, H, &sse_, &sum); \ + else \ + highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \ + msk_stride, H, &sse_, &sum); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ + sum = ROUND_POWER_OF_TWO(sum, 4); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + } -// High bit depth functions for width (W) >= 8 -unsigned int aom_highbd_masked_subpel_varWxH_xzero( - const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, - int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, - int w, int h, highbd_filter_fn_t filter_fn, - highbd_calc_masked_var_t calc_var) { +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) +#endif +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) + +static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi16(a, b); + v0 = _mm_madd_epi16(v0, filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpackhi_epi16(a, b); + v1 = _mm_madd_epi16(v1, filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); +} + +static void highbd_bilinear_filter(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int w, int h) { int i, j; - __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_w = - _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); - assert(yoffset < BIL_SUBPEL_SHIFTS); - for (j = 0; j < w; j += 8) { - // Load the first row ready - v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); - // Process 2 rows at a time - for (i = 0; i < h; i += 2) { - // Load the next row apply the filter - v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride)); - v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); - // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - - // Load the next row apply the filter - v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2)); - v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w); - // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride)); - v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride)); - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next block of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + _mm_storeu_si128((__m128i *)&b[j], x); + } + src += src_stride; + b += w; } - // Reset to the top of the block - src -= src_stride * h; - dst -= dst_stride * h; - msk -= msk_stride * h; - } - return calc_var(v_sum_d, v_sse_q, sse, w, h); -} -unsigned int aom_highbd_masked_subpel_varWxH_yzero( - const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, - int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, - int w, int h, highbd_filter_fn_t filter_fn, - highbd_calc_masked_var_t calc_var) { - int i, j; - __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filter_w = - _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - for (i = 0; i < h; i++) { - for (j = 0; j < w; j += 8) { - // Load this row & apply the filter to them - v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); - v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); - v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w); - - // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + __m128i z = _mm_alignr_epi8(y, x, 2); + _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z)); + } + src += src_stride; + b += w; } - src += src_stride; - dst += dst_stride; - msk += msk_stride; - } - return calc_var(v_sum_d, v_sse_q, sse, w, h); -} - -unsigned int aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( - const uint16_t *src, int src_stride, int xoffset, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn, - highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) { - int i, j; - __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w; - __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - const __m128i v_filterx_w = - _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - const __m128i v_filtery_w = - _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - assert(yoffset < BIL_SUBPEL_SHIFTS); - for (j = 0; j < w; j += 8) { - // Load the first row ready - v_src0_w = _mm_loadu_si128((const __m128i *)(src + j)); - v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1)); - v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); - // Process 2 rows at a time - for (i = 0; i < h; i += 2) { - // Load the next row & apply the filter - v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j)); - v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1)); - v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w); - // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j)); - v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j)); - // Complete the calculation for this row and add it to the running total - v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w); - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - - // Load the next row & apply the filter - v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j)); - v_src1_w = - _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1)); - v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w); - // Load the dst and msk for the variance calculation - v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j)); - v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j)); - // Complete the calculation for this row and add it to the running total - v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w); - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next block of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h + 1; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&src[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]); + const __m128i z = _mm_alignr_epi8(y, x, 2); + const __m128i res = highbd_filter_block(x, z, hfilter_vec); + _mm_storeu_si128((__m128i *)&b[j], res); + } + + src += src_stride; + b += w; } - // Reset to the top of the block - src -= src_stride * h; - dst -= dst_stride * h; - msk -= msk_stride * h; } - return calc_var(v_sum_d, v_sse_q, sse, w, h); -} -// Note order in which rows loaded xmm[127:64] = row 1, xmm[63:0] = row 2 -unsigned int aom_highbd_masked_subpel_var4xH_xzero( - const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst, - int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, - int h, highbd_calc_masked_var_t calc_var) { - int i; - __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w; - __m128i v_dst_w, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); - assert(yoffset < BIL_SUBPEL_SHIFTS); - // Load the first row of src data ready - v_src0_w = _mm_loadl_epi64((const __m128i *)src); - for (i = 0; i < h; i += 2) { - if (yoffset == HALF_PIXEL_OFFSET) { - // Load the rest of the source data for these rows - v_src1_w = _mm_or_si128( - _mm_slli_si128(v_src0_w, 8), - _mm_loadl_epi64((const __m128i *)(src + src_stride * 1))); - v_src0_w = _mm_or_si128( - _mm_slli_si128(v_src1_w, 8), - _mm_loadl_epi64((const __m128i *)(src + src_stride * 2))); - // Apply the y filter - v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w); - } else { - // Load the data and apply the y filter - v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)); - highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d); - v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d); - v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d); + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y)); + } + dst += w; + } + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]); + const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]); + const __m128i res = highbd_filter_block(x, y, vfilter_vec); + _mm_storeu_si128((__m128i *)&dst[j], res); + } + + dst += w; } - // Load the dst data - v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0))); - // Compute the sum and SSE - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; } - return calc_var(v_sum_d, v_sse_q, sse, 4, h); } -unsigned int aom_highbd_masked_subpel_var4xH_yzero( - const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst, - int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse, - int h, highbd_calc_masked_var_t calc_var) { - int i; - __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d; - __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - for (i = 0; i < h; i += 2) { - // Load the src data - v_src0_w = _mm_loadu_si128((const __m128i *)(src)); - v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); - v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); - v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); - v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); - } else { - highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w, - &v_filtered0_d); - highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w, - &v_filtered1_d); - v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); - } - // Load the dst data - v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); - // Compute the sum and SSE - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 2; - dst += dst_stride * 2; - msk += msk_stride * 2; - } - return calc_var(v_sum_d, v_sse_q, sse, 4, h); +static INLINE __m128i highbd_filter_block_2rows(const __m128i a0, + const __m128i b0, + const __m128i a1, + const __m128i b1, + const __m128i filter) { + __m128i v0 = _mm_unpacklo_epi16(a0, b0); + v0 = _mm_madd_epi16(v0, filter); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + __m128i v1 = _mm_unpacklo_epi16(a1, b1); + v1 = _mm_madd_epi16(v1, filter); + v1 = xx_roundn_epu32(v1, FILTER_BITS); + + return _mm_packs_epi32(v0, v1); } -unsigned int aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( - const uint16_t *src, int src_stride, int xoffset, int yoffset, - const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, - unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) { +static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, + int xoffset, int yoffset, uint16_t *dst, + int h) { int i; - __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b; - __m128i v_src0_shift_w, v_src1_shift_w; - __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_q = _mm_setzero_si128(); - __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) + - bilinear_filters_2t[xoffset][0]); - __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) + - bilinear_filters_2t[yoffset][0]); - assert(xoffset < BIL_SUBPEL_SHIFTS); - assert(yoffset < BIL_SUBPEL_SHIFTS); - // Load the first block of src data - v_src0_w = _mm_loadu_si128((const __m128i *)(src)); - v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride)); - v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); - v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); - v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); - } else { - highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, - &v_filtered0_d); - highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, - &v_filtered1_d); - v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); - } - for (i = 0; i < h; i += 4) { - // Load the next block of src data - v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2)); - v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3)); - v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); - v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); - v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); - } else { - highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, - &v_filtered0_d); - highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, - &v_filtered1_d); - v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + // Horizontal filter + if (xoffset == 0) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)src); + _mm_storel_epi64((__m128i *)b, x); + src += src_stride; + b += 4; } - // Apply the y filter to the previous block - v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8), - _mm_slli_si128(v_xres1_w, 8)); - if (yoffset == HALF_PIXEL_OFFSET) { - v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w); - } else { - v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w); + } else if (xoffset == 4) { + uint16_t *b = dst; + for (i = 0; i < h + 1; ++i) { + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z)); + src += src_stride; + b += 4; } - // Load the dst data - v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1))); - // Compute the sum and SSE - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - - // Load the next block of src data - v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4)); - v_src0_shift_w = _mm_srli_si128(v_src0_w, 2); - v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5)); - v_src1_shift_w = _mm_srli_si128(v_src1_w, 2); - // Apply the x filter - if (xoffset == HALF_PIXEL_OFFSET) { - v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w); - v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w); - v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w); - } else { - highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w, - &v_filtered0_d); - highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w, - &v_filtered1_d); - v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d); + } else { + uint16_t *b = dst; + const uint8_t *hfilter = bilinear_filters_2t[xoffset]; + const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x0 = _mm_loadu_si128((__m128i *)src); + const __m128i z0 = _mm_srli_si128(x0, 2); + const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); + const __m128i z1 = _mm_srli_si128(x1, 2); + const __m128i res = + highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec); + _mm_storeu_si128((__m128i *)b, res); + + src += src_stride * 2; + b += 8; } - // Apply the y filter to the previous block - v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8), - _mm_slli_si128(v_xres0_w, 8)); - if (yoffset == HALF_PIXEL_OFFSET) { - v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w); - } else { - v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w); + // Process i = h separately + __m128i x = _mm_loadu_si128((__m128i *)src); + __m128i z = _mm_srli_si128(x, 2); + + __m128i v0 = _mm_unpacklo_epi16(x, z); + v0 = _mm_madd_epi16(v0, hfilter_vec); + v0 = xx_roundn_epu32(v0, FILTER_BITS); + + _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0)); + } + + // Vertical filter + if (yoffset == 0) { + // The data is already in 'dst', so no need to filter + } else if (yoffset == 4) { + for (i = 0; i < h; ++i) { + __m128i x = _mm_loadl_epi64((__m128i *)dst); + __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y)); + dst += 4; } - // Load the dst data - v_dst_w = _mm_unpacklo_epi64( - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)), - _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3))); - // Load the mask data - v_msk_b = _mm_unpacklo_epi32( - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)), - _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3))); - // Compute the sum and SSE - highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q); - // Move onto the next set of rows - src += src_stride * 4; - dst += dst_stride * 4; - msk += msk_stride * 4; - } - return calc_var(v_sum_d, v_sse_q, sse, 4, h); -} + } else { + const uint8_t *vfilter = bilinear_filters_2t[yoffset]; + const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16)); + for (i = 0; i < h; i += 2) { + const __m128i x = _mm_loadl_epi64((__m128i *)dst); + const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); + const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); + const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec); + _mm_storeu_si128((__m128i *)dst, res); -// For W >=8 -#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \ - unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse, highbd_calc_masked_var_t calc_var, \ - highbd_variance_fn_t full_variance_function) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - assert(W % 8 == 0); \ - if (xoffset == 0) { \ - if (yoffset == 0) \ - return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ - msk_stride, sse); \ - else if (yoffset == HALF_PIXEL_OFFSET) \ - return aom_highbd_masked_subpel_varWxH_xzero( \ - src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ - else \ - return aom_highbd_masked_subpel_varWxH_xzero( \ - src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \ - W, H, highbd_apply_filter, calc_var); \ - } else if (yoffset == 0) { \ - if (xoffset == HALF_PIXEL_OFFSET) \ - return aom_highbd_masked_subpel_varWxH_yzero( \ - src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \ - else \ - return aom_highbd_masked_subpel_varWxH_yzero( \ - src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \ - W, H, highbd_apply_filter, calc_var); \ - } else if (xoffset == HALF_PIXEL_OFFSET) { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \ - dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \ - highbd_apply_filter_avg, calc_var); \ - else \ - return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, highbd_apply_filter_avg, \ - highbd_apply_filter, calc_var); \ - } else { \ - if (yoffset == HALF_PIXEL_OFFSET) \ - return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \ - msk_stride, sse, W, H, highbd_apply_filter, \ - highbd_apply_filter_avg, calc_var); \ - else \ - return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \ - msk_stride, sse, W, H, highbd_apply_filter, highbd_apply_filter, \ - calc_var); \ - } \ + dst += 8; + } } +} -// For W < 8 -#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \ - unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse, highbd_calc_masked_var_t calc_var, \ - highbd_variance_fn_t full_variance_function) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - assert(W == 4); \ - if (xoffset == 0 && yoffset == 0) \ - return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \ - msk_stride, sse); \ - else if (xoffset == 0) \ - return aom_highbd_masked_subpel_var4xH_xzero( \ - src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \ - calc_var); \ - else if (yoffset == 0) \ - return aom_highbd_masked_subpel_var4xH_yzero( \ - src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \ - calc_var); \ - else \ - return aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \ - src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \ - sse, H, calc_var); \ - } +static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, int a_stride, + const uint16_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, uint64_t *sse, + int *sum_) { + int x, y; + // Note on bit widths: + // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26, + // so this can be kept as four 32-bit values. + // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38, + // so this must be stored as two 64-bit values. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 8) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + // Calculate 8 predicted pixels. + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit + // field, but the range of values is only [-(2^12 - 1), 2^12 - 1]. + // So we can re-pack into 16-bit fields and use _mm_madd_epi16 + // to calculate the squares and partially sum them. + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + // Then we want to sign-extend to 64 bits and accumulate + const __m128i sign = _mm_srai_epi32(prod, 31); + const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign); + const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign); + sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1)); + } -#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \ - unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ - sse, calc_masked_variance, \ - aom_highbd_masked_variance##W##x##H##_ssse3); \ - } \ - unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ - sse, highbd_10_calc_masked_variance, \ - aom_highbd_10_masked_variance##W##x##H##_ssse3); \ - } \ - unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ - const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \ - unsigned int *sse) { \ - return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \ - src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \ - sse, highbd_12_calc_masked_variance, \ - aom_highbd_12_masked_variance##W##x##H##_ssse3); \ - } + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, zero); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8)); + _mm_storel_epi64((__m128i *)sse, sum_sq); +} + +static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, + const uint16_t *a_ptr, + const uint16_t *b_ptr, + const uint8_t *m_ptr, int m_stride, + int height, int *sse, int *sum_) { + int y; + // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions). + // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18 + // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30. + // So we can safely pack sum_sq into 32-bit fields, which is slightly more + // convenient. + __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i zero = _mm_setzero_si128(); + + for (y = 0; y < height; y += 2) { + __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); + const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr); + const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr); + const __m128i m = _mm_unpacklo_epi8( + _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), + zero); + const __m128i m_inv = _mm_sub_epi16(mask_max, m); + + const __m128i data_l = _mm_unpacklo_epi16(a, b); + const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); + __m128i pred_l = _mm_madd_epi16(data_l, mask_l); + pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i data_r = _mm_unpackhi_epi16(a, b); + const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); + __m128i pred_r = _mm_madd_epi16(data_r, mask_r); + pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i src_l = _mm_unpacklo_epi16(src, zero); + const __m128i src_r = _mm_unpackhi_epi16(src, zero); + __m128i diff_l = _mm_sub_epi32(pred_l, src_l); + __m128i diff_r = _mm_sub_epi32(pred_r, src_r); + + // Update partial sums and partial sums of squares + sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r)); + const __m128i tmp = _mm_packs_epi32(diff_l, diff_r); + const __m128i prod = _mm_madd_epi16(tmp, tmp); + sum_sq = _mm_add_epi32(sum_sq, prod); + + src_ptr += src_stride * 2; + a_ptr += 8; + b_ptr += 8; + m_ptr += m_stride * 2; + } + // Reduce down to a single sum and sum of squares + sum = _mm_hadd_epi32(sum, sum_sq); + sum = _mm_hadd_epi32(sum, zero); + *sum_ = _mm_cvtsi128_si32(sum); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); +} -HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4) -HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8) -HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4) -HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8) -HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16) -HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8) -HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16) -HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32) -HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16) -HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32) -HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64) -HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32) -HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64) -#if CONFIG_EXT_PARTITION -HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128) -HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64) -HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128) -HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128) -#endif // CONFIG_EXT_PARTITION #endif diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h new file mode 100644 index 000000000..73589a32a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#define AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ + +#include + +#include "./aom_config.h" + +static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { + v_d = _mm_hadd_epi32(v_d, v_d); + v_d = _mm_hadd_epi32(v_d, v_d); + return _mm_cvtsi128_si32(v_d); +} + +static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { + v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); +#if ARCH_X86_64 + return _mm_cvtsi128_si64(v_q); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_q); + return tmp; + } +#endif +} + +static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { + const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); + const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); + const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); + return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); +} + +#endif // AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c index ad77f974c..21632644f 100644 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -17,6 +17,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" #include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index efb3659cf..1797ded80 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -17,8 +17,9 @@ #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/x86/synonyms.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// // 8 bit diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index bef606dae..cd049a454 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -89,32 +89,4 @@ static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { return _mm_srai_epi32(v_tmp_d, bits); } -#ifdef __SSSE3__ -static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { - v_d = _mm_hadd_epi32(v_d, v_d); - v_d = _mm_hadd_epi32(v_d, v_d); - return _mm_cvtsi128_si32(v_d); -} - -static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) { - v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8)); -#if ARCH_X86_64 - return _mm_cvtsi128_si64(v_q); -#else - { - int64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_q); - return tmp; - } -#endif -} - -static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { - const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128()); - const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d); - const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d); - return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); -} -#endif // __SSSE3__ - #endif // AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h index 39e9b8e2a..4f7a60c22 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -34,7 +34,8 @@ static INLINE void mm256_reverse_epi16(__m256i *u) { *u = _mm256_permute2x128_si256(v, v, 1); } -static INLINE void mm256_transpose_16x16(__m256i *in) { +// Note: in and out could have the same value +static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); @@ -143,29 +144,30 @@ static INLINE void mm256_transpose_16x16(__m256i *in) { // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff - in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 - in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 - in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); - in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); - in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); - in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); - in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); - in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); - - in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); - in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); - in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); - in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); - in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); - in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); - in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); - in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); + out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); } -static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) { +static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1, + const __m256i *cospi) { const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i y0 = _mm256_madd_epi16(a0, cospi); - __m256i y1 = _mm256_madd_epi16(a1, cospi); + __m256i y0 = _mm256_madd_epi16(*a0, *cospi); + __m256i y1 = _mm256_madd_epi16(*a1, *cospi); y0 = _mm256_add_epi32(y0, dct_rounding); y1 = _mm256_add_epi32(y1, dct_rounding); -- cgit v1.2.3 From 7369c7d7a5eed32963d8af37658286617919f91c Mon Sep 17 00:00:00 2001 From: trav90 Date: Thu, 18 Oct 2018 06:04:57 -0500 Subject: Update aom to commit id f5bdeac22930ff4c6b219be49c843db35970b918 --- third_party/aom/aom_dsp/aom_convolve.c | 227 ++++ third_party/aom/aom_dsp/aom_dsp.cmake | 56 +- third_party/aom/aom_dsp/aom_dsp.mk | 5 +- third_party/aom/aom_dsp/aom_dsp_common.h | 17 +- third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 149 +-- third_party/aom/aom_dsp/aom_filter.h | 6 + third_party/aom/aom_dsp/bitreader.h | 2 - third_party/aom/aom_dsp/bitwriter.h | 2 - third_party/aom/aom_dsp/blend_a64_mask.c | 63 ++ third_party/aom/aom_dsp/fwd_txfm.c | 42 - third_party/aom/aom_dsp/fwd_txfm.h | 5 - third_party/aom/aom_dsp/intrapred.c | 569 ++++++----- third_party/aom/aom_dsp/inv_txfm.c | 1080 +++----------------- third_party/aom/aom_dsp/inv_txfm.h | 19 +- third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c | 20 - third_party/aom/aom_dsp/mips/fwd_txfm_msa.c | 8 - third_party/aom/aom_dsp/prob.c | 4 +- third_party/aom/aom_dsp/prob.h | 2 - third_party/aom/aom_dsp/quantize.c | 13 +- third_party/aom/aom_dsp/sad.c | 58 +- .../aom/aom_dsp/simd/v256_intrinsics_v128.h | 22 +- third_party/aom/aom_dsp/txfm_common.h | 79 ++ third_party/aom/aom_dsp/variance.c | 218 +++- third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c | 141 --- .../aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 171 ++++ .../aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 2 - third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm | 6 + third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm | 13 +- .../x86/highbd_subpel_variance_impl_sse2.asm | 2 +- third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 270 ++--- third_party/aom/aom_dsp/x86/inv_txfm_sse2.c | 234 ----- .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 12 + .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 28 +- third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 12 + third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 12 + third_party/aom/aom_dsp/x86/sad4d_sse2.asm | 6 + third_party/aom/aom_dsp/x86/sad_highbd_avx2.c | 5 + third_party/aom/aom_dsp/x86/sad_sse2.asm | 16 + third_party/aom/aom_dsp/x86/variance_sse2.c | 469 ++++----- 39 files changed, 1850 insertions(+), 2215 deletions(-) create mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c index 4dac6aacc..c903ea52d 100644 --- a/third_party/aom/aom_dsp/aom_convolve.c +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -41,6 +41,29 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, } } +static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_qn, + int x_step_qn, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; // q8 + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *const x_filter = x_filters[x_filter_idx]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_qn += x_step_qn; + } + src += src_stride; + dst += dst_stride; + } +} + static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, @@ -63,6 +86,30 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, } } +static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_qn, + int x_step_qn, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { + int x_qn = x0_qn; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; + const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; + assert(x_filter_idx < SUBPEL_SHIFTS); + const int16_t *const x_filter = x_filters[x_filter_idx]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_qn += x_step_qn; + } + src += src_stride; + dst += dst_stride; + } +} + static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, @@ -86,6 +133,31 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, } } +static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_qn, + int y_step_qn, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_qn = y0_qn; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = + &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = + y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_qn += y_step_qn; + } + ++src; + ++dst; + } +} + static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, @@ -112,6 +184,34 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } +static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_qn, + int y_step_qn, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_qn = y0_qn; + for (y = 0; y < h; ++y) { + const unsigned char *src_y = + &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = + y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_qn += y_step_qn; + } + ++src; + ++dst; + } +} + static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4, int x_step_q4, @@ -146,6 +246,41 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); } +static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const x_filters, int x0_qn, + int x_step_qn, const InterpKernel *const y_filters, + int y0_qn, int y_step_qn, int w, int h) { + // TODO(afergs): Update comment here + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; + int intermediate_height = + (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + assert(y_step_qn <= SCALE_SUBPEL_BITS * 2); + assert(x_step_qn <= SCALE_SUBPEL_BITS * 2); + + convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w, + intermediate_height); + convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, + dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h); +} + static const InterpKernel *get_filter_base(const int16_t *filter) { // NOTE: This assumes that the filter table is 256-byte aligned. // TODO(agrange) Modify to make independent of table alignment. @@ -171,6 +306,21 @@ void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, w, h); } +void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int subpel_x, + int x_step_qn, const int16_t *filter_y, + int subpel_y, int y_step_qn, int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + + (void)subpel_y; + (void)filter_y; + (void)y_step_qn; + + convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x, + x_step_qn, w, h); +} + void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -186,6 +336,22 @@ void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, w, h); } +void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int subpel_x, + int x_step_qn, const int16_t *filter_y, + int subpel_y, int y_step_qn, int w, + int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + + (void)subpel_y; + (void)filter_y; + (void)y_step_qn; + + convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, + subpel_x, x_step_qn, w, h); +} + void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -201,6 +367,21 @@ void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, w, h); } +void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int subpel_x, + int x_step_qn, const int16_t *filter_y, + int subpel_y, int y_step_qn, int w, int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + + (void)subpel_x; + (void)filter_x; + (void)x_step_qn; + + convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y, + y_step_qn, w, h); +} + void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -216,6 +397,21 @@ void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h); } +void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int subpel_x, + int x_step_qn, const int16_t *filter_y, + int subpel_y, int y_step_qn, int w, int h) { + const InterpKernel *const filters_y = get_filter_base(filter_y); + + (void)subpel_x; + (void)filter_x; + (void)x_step_qn; + + convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, + subpel_y, y_step_qn, w, h); +} + void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -230,6 +426,19 @@ void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, filters_y, y0_q4, y_step_q4, w, h); } +void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int subpel_x, int x_step_qn, + const int16_t *filter_y, int subpel_y, int y_step_qn, + int w, int h) { + const InterpKernel *const filters_x = get_filter_base(filter_x); + + const InterpKernel *const filters_y = get_filter_base(filter_y); + + convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x, + x_step_qn, filters_y, subpel_y, y_step_qn, w, h); +} + void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, @@ -245,6 +454,22 @@ void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, h); } +void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int subpel_x, + int x_step_qn, const int16_t *filter_y, + int subpel_y, int y_step_qn, int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); + assert(w <= MAX_SB_SIZE); + assert(h <= MAX_SB_SIZE); + + aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x, + x_step_qn, filter_y, subpel_y, y_step_qn, w, h); + aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, + h); +} + void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, @@ -332,6 +557,7 @@ void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, filter_y, y_step_q4, w, h); } +// TODO(afergs): Make sure this works too #if CONFIG_LOOP_RESTORATION static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -569,6 +795,7 @@ void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride, } #endif // CONFIG_LOOP_RESTORATION +// TODO(afergs): Make sure this works too #if CONFIG_HIGHBITDEPTH static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 5a49ae817..3ce6761ca 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -287,6 +287,7 @@ if (CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_avx2.h" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" @@ -312,6 +313,7 @@ if (CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c") @@ -330,10 +332,16 @@ if (CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") + if (CONFIG_EXT_INTER) + set(AOM_DSP_ENCODER_INTRIN_SSSE3 + ${AOM_DSP_ENCODER_INTRIN_SSSE3} + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c") + endif () + if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_ENCODER_INTRIN_SSE2 ${AOM_DSP_ENCODER_INTRIN_SSE2} - "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c") endif () endif () @@ -407,29 +415,38 @@ endif () # has been created. function (setup_aom_dsp_targets) add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) - set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_common) - target_sources(aom PUBLIC $) + list(APPEND AOM_LIB_TARGETS aom_dsp_common) + create_dummy_source_file("aom_av1" "c" "dummy_source_file") + add_library(aom_dsp OBJECT "${dummy_source_file}") + target_sources(aom PRIVATE $) + list(APPEND AOM_LIB_TARGETS aom_dsp) + + # Not all generators support libraries consisting only of object files. Add a + # dummy source file to the aom_dsp target. + add_dummy_source_file_to_target("aom_dsp" "c") if (CONFIG_AV1_DECODER) add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder) - target_sources(aom PUBLIC $) + target_sources(aom PRIVATE $) endif () if (CONFIG_AV1_ENCODER) add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder) - target_sources(aom PUBLIC $) + target_sources(aom PRIVATE $) endif () if (HAVE_SSE2) add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_SSE2") + "AOM_DSP_COMMON_INTRIN_SSE2" "aom") + if (CONFIG_AV1_ENCODER) - add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom") + add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" + "aom") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSE2") + "AOM_DSP_ENCODER_INTRIN_SSE2" "aom") endif() endif () @@ -440,7 +457,7 @@ function (setup_aom_dsp_targets) if (HAVE_SSSE3) add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom") add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_SSSE3") + "AOM_DSP_COMMON_INTRIN_SSSE3" "aom") if (CONFIG_AV1_ENCODER) if ("${AOM_TARGET_CPU}" STREQUAL "x86_64") @@ -448,16 +465,20 @@ function (setup_aom_dsp_targets) ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) endif () add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom") + if (AOM_DSP_ENCODER_INTRIN_SSSE3) + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom") + endif () endif () endif () if (HAVE_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_SSE4_1") + "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom") if (CONFIG_AV1_ENCODER) if (AOM_DSP_ENCODER_INTRIN_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSE4_1") + "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom") endif () add_asm_library("aom_dsp_encoder_sse4_1" "AOM_DSP_ENCODER_ASM_SSE4_1" "aom") @@ -473,10 +494,10 @@ function (setup_aom_dsp_targets) if (HAVE_AVX2) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_AVX2") + "AOM_DSP_COMMON_INTRIN_AVX2" "aom") if (CONFIG_AV1_ENCODER) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_AVX2") + "AOM_DSP_ENCODER_INTRIN_AVX2" "aom") endif () endif () @@ -490,20 +511,21 @@ function (setup_aom_dsp_targets) if (HAVE_NEON) add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" - "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON") + "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON" + "aom") endif () if (HAVE_DSPR2) add_intrinsics_object_library("" "dspr2" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_DSPR2") + "AOM_DSP_COMMON_INTRIN_DSPR2" "aom") endif () if (HAVE_MSA) add_intrinsics_object_library("" "msa" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_MSA") + "AOM_DSP_COMMON_INTRIN_MSA" "aom") if (CONFIG_AV1_ENCODER) add_intrinsics_object_library("" "msa" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_MSA") + "AOM_DSP_ENCODER_INTRIN_MSA" "aom") endif () endif () diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk index 6e2d5630e..f9d675ac0 100644 --- a/third_party/aom/aom_dsp/aom_dsp.mk +++ b/third_party/aom/aom_dsp/aom_dsp.mk @@ -290,9 +290,10 @@ DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c -ifeq ($(CONFIG_HIGHBITDEPTH),yes) + DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c -endif +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c + ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h index 82f9a95e9..5b104321b 100644 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -31,8 +31,6 @@ extern "C" { #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) #define AOMMAX(x, y) (((x) > (y)) ? (x) : (y)) -#define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) - #define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b') #define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0) @@ -54,16 +52,9 @@ extern "C" { #define UNLIKELY(v) (v) #endif -#define AOM_SWAP(type, a, b) \ - do { \ - type c = (b); \ - b = a; \ - a = c; \ - } while (0) - #if CONFIG_AOM_QM typedef uint16_t qm_val_t; -#define AOM_QM_BITS 6 +#define AOM_QM_BITS 5 #endif #if CONFIG_HIGHBITDEPTH // Note: @@ -87,11 +78,14 @@ static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); } +static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { + return value < low ? low : (value > high ? high : value); +} + static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -#if CONFIG_HIGHBITDEPTH static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -100,7 +94,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { case 12: return (uint16_t)clamp(val, 0, 4095); } } -#endif // CONFIG_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 8047cbc09..0c0356870 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -40,12 +40,26 @@ foreach $w (@block_widths) { push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ; } } +if (aom_config("CONFIG_EXT_PARTITION_TYPES")) { + push @block_sizes, [4, 16]; + push @block_sizes, [16, 4]; + push @block_sizes, [8, 32]; + push @block_sizes, [32, 8]; +} @tx_dims = (2, 4, 8, 16, 32); if (aom_config("CONFIG_TX64X64") eq "yes") { push @tx_dims, '64'; } +@tx_sizes = (); +foreach $w (@tx_dims) { + push @tx_sizes, [$w, $w]; + foreach $h (@tx_dims) { + push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w)); + } +} + @pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153/; if (aom_config("CONFIG_ALT_INTRA") eq "yes") { push @pred_names, qw/paeth smooth/; @@ -60,9 +74,8 @@ if (aom_config("CONFIG_ALT_INTRA") eq "yes") { # Intra prediction # -foreach $dim (@tx_dims) { - $w = ${dim}; - $h = ${dim}; +foreach (@tx_sizes) { + ($w, $h) = @$_; foreach $pred_name (@pred_names) { add_proto "void", "aom_${pred_name}_predictor_${w}x${h}", "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; @@ -142,20 +155,27 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # -add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; + +add_proto qw/void aom_convolve8_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_vert_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_avg_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_avg_vert_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; +add_proto qw/void aom_convolve8_avg_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; specialize qw/aom_convolve_copy sse2 /; specialize qw/aom_convolve_avg sse2 /; @@ -334,24 +354,15 @@ if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64"; - add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct8x8_1 sse2/; - add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct16x16 sse2/; - add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct16x16_1 sse2 avx2/; - add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct32x32 sse2 avx2/; add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct32x32_rd sse2 avx2/; - add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct32x32_1 sse2 avx2/; - # High bit depth add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_highbd_fdct4x4 sse2/; @@ -359,20 +370,15 @@ if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_highbd_fdct8x8 sse2/; - add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_highbd_fdct16x16 sse2/; - add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_highbd_fdct32x32 sse2/; add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_highbd_fdct32x32_rd sse2/; - add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; } else { add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct4x4 sse2 msa/; @@ -383,47 +389,25 @@ if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; - add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct8x8_1 sse2 neon msa/; - add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct16x16 sse2 msa/; - add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct16x16_1 sse2 avx2 msa/; - add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct32x32 sse2 avx2 msa/; add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct32x32_rd sse2 avx2 msa/; - - add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct32x32_1 sse2 avx2 msa/; } # CONFIG_HIGHBITDEPTH } # CONFIG_AV1_ENCODER # # Inverse transform if (aom_config("CONFIG_AV1") eq "yes") { -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_iwht4x4_16_add sse2/; - add_proto qw/void aom_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - add_proto qw/void aom_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; @@ -469,10 +453,8 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_idct32x32_1_add sse2 avx2/; - - add_proto qw/void aom_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/aom_highbd_idct4x4_16_add sse2/; } +if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { } else { { add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; @@ -541,13 +523,12 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") { add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; + + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - } # CONFIG_HIGHBITDEPTH } # CONFIG_AV1_ENCODER } else { if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { @@ -559,21 +540,23 @@ if (aom_config("CONFIG_AOM_QM") eq "yes") { add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b sse2/; + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b sse2 avx2/; - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b_32x32 sse2/; + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32 sse2/; + + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - } # CONFIG_HIGHBITDEPTH } # CONFIG_AV1_ENCODER } # CONFIG_AOM_QM if (aom_config("CONFIG_AV1") eq "yes") { # # Alpha blending with mask # + if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") { + add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; + } add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w"; add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w"; @@ -927,15 +910,15 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { # # ... # -add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride"; +add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; specialize qw/aom_upsampled_pred sse2/; -add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; +add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; specialize qw/aom_comp_avg_upsampled_pred sse2/; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; specialize qw/aom_highbd_upsampled_pred sse2/; - add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; } @@ -1005,6 +988,22 @@ specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + +if (aom_config("CONFIG_EXT_PARTITION_TYPES")) { + specialize qw/aom_variance4x16 sse2/; + specialize qw/aom_variance16x4 sse2/; + specialize qw/aom_variance8x32 sse2/; + specialize qw/aom_variance32x8 sse2/; + specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; +} + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { foreach $bd (8, 10, 12) { add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; @@ -1021,6 +1020,8 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; } + # TODO(david.barker): When ext-partition-types is enabled, we currenly + # don't have vectorized 4x16 highbd variance functions if ($w == 4 && $h == 4) { specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; } @@ -1496,10 +1497,10 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_EXT_INTER") eq "yes") { add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd"; } } diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h index 04d113dd3..58e8bb284 100644 --- a/third_party/aom/aom_dsp/aom_filter.h +++ b/third_party/aom/aom_dsp/aom_filter.h @@ -25,6 +25,12 @@ extern "C" { #define SUBPEL_SHIFTS (1 << SUBPEL_BITS) #define SUBPEL_TAPS 8 +#define SCALE_SUBPEL_BITS 10 +#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS) +#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1) +#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS) +#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2) + typedef int16_t InterpKernel[SUBPEL_TAPS]; #define BIL_SUBPEL_BITS 3 diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h index 5bad70cb3..88bedccc2 100644 --- a/third_party/aom/aom_dsp/bitreader.h +++ b/third_party/aom/aom_dsp/bitreader.h @@ -194,9 +194,7 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int ret; ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); -#if CONFIG_EC_ADAPT update_cdf(cdf, ret, nsymbs); -#endif return ret; } diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h index 588e47bf3..68bc1c8f8 100644 --- a/third_party/aom/aom_dsp/bitwriter.h +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -140,9 +140,7 @@ static INLINE void aom_write_cdf(aom_writer *w, int symb, static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, int nsymbs) { aom_write_cdf(w, symb, cdf, nsymbs); -#if CONFIG_EC_ADAPT update_cdf(cdf, symb, nsymbs); -#endif } static INLINE void aom_write_tree_as_cdf(aom_writer *w, diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c index 3e15542c9..c35fa19f8 100644 --- a/third_party/aom/aom_dsp/blend_a64_mask.c +++ b/third_party/aom/aom_dsp/blend_a64_mask.c @@ -18,6 +18,69 @@ #include "./aom_dsp_rtcd.h" +#if CONFIG_CONVOLVE_ROUND +// Blending with alpha mask. Mask values come from the range [0, 64], +// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can +// be the same as dst, or dst can be different from both sources. + +void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride, + const int32_t *src0, uint32_t src0_stride, + const int32_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + int w, int subh, int subw) { + int i, j; + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = mask[i * mask_stride + j]; + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 1) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = ROUND_POWER_OF_TWO( + mask[(2 * i) * mask_stride + (2 * j)] + + mask[(2 * i + 1) * mask_stride + (2 * j)] + + mask[(2 * i) * mask_stride + (2 * j + 1)] + + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], + 2); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else if (subw == 1 && subh == 0) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], + mask[i * mask_stride + (2 * j + 1)]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } else { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], + mask[(2 * i + 1) * mask_stride + j]); + dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], + src1[i * src1_stride + j]); + } + } + } +} +#endif // CONFIG_CONVOLVE_ROUND + // Blending with alpha mask. Mask values come from the range [0, 64], // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can // be the same as dst, or dst can be different from both sources. diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c index 12ee02ba1..1ceef7782 100644 --- a/third_party/aom/aom_dsp/fwd_txfm.c +++ b/third_party/aom/aom_dsp/fwd_txfm.c @@ -172,15 +172,6 @@ void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { } } -void aom_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 8; ++r) - for (c = 0; c < 8; ++c) sum += input[r * stride + c]; - - output[0] = sum; -} - void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { // The 2D transform is done with two passes which are actually pretty // similar. In the first one, we transform the columns and transpose @@ -361,15 +352,6 @@ void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { } } -void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - int sum = 0; - for (r = 0; r < 16; ++r) - for (c = 0; c < 16; ++c) sum += input[r * stride + c]; - - output[0] = (tran_low_t)(sum >> 1); -} - static INLINE tran_high_t dct_32_round(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); // TODO(debargha, peter.derivaz): Find new bounds for this assert, @@ -758,15 +740,6 @@ void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { } } -void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - int sum = 0; - for (r = 0; r < 32; ++r) - for (c = 0; c < 32; ++c) sum += input[r * stride + c]; - - output[0] = (tran_low_t)(sum >> 3); -} - #if CONFIG_HIGHBITDEPTH void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { @@ -778,32 +751,17 @@ void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, aom_fdct8x8_c(input, final_output, stride); } -void aom_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, - int stride) { - aom_fdct8x8_1_c(input, final_output, stride); -} - void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { aom_fdct16x16_c(input, output, stride); } -void aom_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, - int stride) { - aom_fdct16x16_1_c(input, output, stride); -} - void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { aom_fdct32x32_c(input, out, stride); } - void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { aom_fdct32x32_rd_c(input, out, stride); } -void aom_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, - int stride) { - aom_fdct32x32_1_c(input, out, stride); -} #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/fwd_txfm.h b/third_party/aom/aom_dsp/fwd_txfm.h index 579dbd06e..f4dc04ab4 100644 --- a/third_party/aom/aom_dsp/fwd_txfm.h +++ b/third_party/aom/aom_dsp/fwd_txfm.h @@ -20,10 +20,5 @@ static INLINE tran_high_t saturate_int16(tran_high_t value) { return result < INT16_MIN ? INT16_MIN : result; } -static INLINE tran_high_t fdct_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return rv; -} - void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round); #endif // AOM_DSP_FWD_TXFM_H_ diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c index 370d0374b..b4d47ae89 100644 --- a/third_party/aom/aom_dsp/intrapred.c +++ b/third_party/aom/aom_dsp/intrapred.c @@ -23,13 +23,14 @@ #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) #define AVG2(a, b) (((a) + (b) + 1) >> 1) -static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int r, c; (void)above; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], left[(c >> 1) + r + 2]) : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); @@ -38,12 +39,13 @@ static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int r, c; (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], above[(r >> 1) + c + 2]) : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); @@ -52,46 +54,49 @@ static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int r, c; (void)left; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); + above[r + c + 1 + (r + c + 2 < bw + bh)]); } dst += stride; } } -static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int r, c; // first row - for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) + for (r = 3; r < bh; ++r) dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); // the rest of the block - for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + for (r = 2; r < bh; ++r) { + for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1]; dst += stride; } } -static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int i; #if CONFIG_TX64X64 #if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 @@ -111,64 +116,65 @@ static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, #endif #endif // CONFIG_TX64X64 - // dst(bs, bs - 2)[0], i.e., border starting at bottom-left - for (i = 0; i < bs - 2; ++i) { - border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]); + // dst(bh, bh - 2)[0], i.e., border starting at bottom-left + for (i = 0; i < bh - 2; ++i) { + border[i] = AVG3(left[bh - 3 - i], left[bh - 2 - i], left[bh - 1 - i]); } - border[bs - 2] = AVG3(above[-1], left[0], left[1]); - border[bs - 1] = AVG3(left[0], above[-1], above[0]); - border[bs - 0] = AVG3(above[-1], above[0], above[1]); + border[bh - 2] = AVG3(above[-1], left[0], left[1]); + border[bh - 1] = AVG3(left[0], above[-1], above[0]); + border[bh - 0] = AVG3(above[-1], above[0], above[1]); // dst[0][2, size), i.e., remaining top border ascending - for (i = 0; i < bs - 2; ++i) { - border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); + for (i = 0; i < bw - 2; ++i) { + border[bh + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); } - for (i = 0; i < bs; ++i) { - memcpy(dst + i * stride, border + bs - 1 - i, bs); + for (i = 0; i < bh; ++i) { + memcpy(dst + i * stride, border + bh - 1 - i, bw); } } -static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int r, c; dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; dst[0] = AVG3(left[0], above[-1], above[0]); dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) + for (r = 2; r < bh; r++) dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); dst++; - for (c = 0; c < bs - 2; c++) + for (c = 0; c < bw - 2; c++) dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + for (r = 1; r < bh; ++r) { + for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2]; dst += stride; } } -static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; (void)left; - for (r = 0; r < bs; r++) { - memcpy(dst, above, bs); + for (r = 0; r < bh; r++) { + memcpy(dst, above, bw); dst += stride; } } -static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; (void)above; - for (r = 0; r < bs; r++) { - memset(dst, left[r], bs); + for (r = 0; r < bh; r++) { + memset(dst, left[r], bw); dst += stride; } } @@ -189,13 +195,14 @@ static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, : (p_top <= p_top_left) ? top : top_left; } -static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int r, c; const uint8_t ytop_left = above[-1]; - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left); dst += stride; } @@ -236,32 +243,38 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { }; // Some basic checks on weights for smooth predictor. -#define sm_weights_sanity_checks(weights, weights_scale, pred_scale) \ - assert(weights[0] < weights_scale); \ - assert(weights_scale - weights[bs - 1] < weights_scale); \ +#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \ + pred_scale) \ + assert(weights_w[0] < weights_scale); \ + assert(weights_h[0] < weights_scale); \ + assert(weights_scale - weights_w[bw - 1] < weights_scale); \ + assert(weights_scale - weights_h[bh - 1] < weights_scale); \ assert(pred_scale < 31) // ensures no overflow when calculating predictor. #define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits)) -static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel - const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights = sm_weight_arrays + bs; +static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; // scale = 2 * 2^sm_weight_log2_scale const int log2_scale = 1 + sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); int r; - for (r = 0; r < bs; ++r) { + for (r = 0; r < bh; ++r) { int c; - for (c = 0; c < bs; ++c) { + for (c = 0; c < bw; ++c) { const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred }; - const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r], - sm_weights[c], scale - sm_weights[c] }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; uint32_t this_pred = 0; int i; - assert(scale >= sm_weights[r] && scale >= sm_weights[c]); + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } @@ -272,20 +285,21 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } #if CONFIG_SMOOTH_HV -static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, +static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, const uint8_t *left) { - const uint8_t below_pred = left[bs - 1]; // estimated by bottom-left pixel - const uint8_t *const sm_weights = sm_weight_arrays + bs; + const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bh; // scale = 2^sm_weight_log2_scale const int log2_scale = sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); int r; - for (r = 0; r < bs; r++) { + for (r = 0; r < bh; r++) { int c; - for (c = 0; c < bs; ++c) { + for (c = 0; c < bw; ++c) { const uint8_t pixels[] = { above[c], below_pred }; const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; uint32_t this_pred = 0; @@ -300,20 +314,21 @@ static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, } } -static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, +static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, const uint8_t *left) { - const uint8_t right_pred = above[bs - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights = sm_weight_arrays + bs; + const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bw; // scale = 2^sm_weight_log2_scale const int log2_scale = sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); int r; - for (r = 0; r < bs; r++) { + for (r = 0; r < bh; r++) { int c; - for (c = 0; c < bs; ++c) { + for (c = 0; c < bw; ++c) { const uint8_t pixels[] = { left[r], right_pred }; const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; uint32_t this_pred = 0; @@ -331,74 +346,78 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, #else -static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r, c; int ytop_left = above[-1]; - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) dst[c] = clip_pixel(left[r] + above[c] - ytop_left); dst += stride; } } #endif // CONFIG_ALT_INTRA -static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int r; (void)above; (void)left; - for (r = 0; r < bs; r++) { - memset(dst, 128, bs); + for (r = 0; r < bh; r++) { + memset(dst, 128, bw); dst += stride; } } -static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, +static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; (void)above; - for (i = 0; i < bs; i++) sum += left[i]; - expected_dc = (sum + (bs >> 1)) / bs; + for (i = 0; i < bh; i++) sum += left[i]; + expected_dc = (sum + (bh >> 1)) / bh; - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); dst += stride; } } -static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left) { int i, r, expected_dc, sum = 0; (void)left; - for (i = 0; i < bs; i++) sum += above[i]; - expected_dc = (sum + (bs >> 1)) / bs; + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); dst += stride; } } -static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int i, r, expected_dc, sum = 0; - const int count = 2 * bs; + const int count = bw + bh; - for (i = 0; i < bs; i++) { + for (i = 0; i < bw; i++) { sum += above[i]; + } + for (i = 0; i < bh; i++) { sum += left[i]; } expected_dc = (sum + (count >> 1)) / count; - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); + for (r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); dst += stride; } } @@ -546,14 +565,14 @@ void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, #if CONFIG_HIGHBITDEPTH static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; (void)above; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], left[(c >> 1) + r + 2]) : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); @@ -563,13 +582,13 @@ static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, } static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], above[(r >> 1) + c + 2]) : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); @@ -579,111 +598,111 @@ static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride, } static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bs * 2)]); + above[r + c + 1 + (r + c + 2 < bw + bh)]); } dst += stride; } } static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; (void)bd; // first row - for (c = 0; c < bs; c++) dst[c] = AVG2(above[c - 1], above[c]); + for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]); dst += stride; // second row dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst += stride; // the rest of first col dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) + for (r = 3; r < bh; ++r) dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); // the rest of the block - for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) dst[c] = dst[-2 * stride + c - 1]; + for (r = 2; r < bh; ++r) { + for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1]; dst += stride; } } static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; (void)bd; dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); + for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; ++r) + for (r = 2; r < bh; ++r) dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1]; + for (r = 1; r < bh; ++r) { + for (c = 1; c < bw; c++) dst[c] = dst[-stride + c - 1]; dst += stride; } } static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; (void)bd; dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); + for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); dst++; dst[0] = AVG3(left[0], above[-1], above[0]); dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) + for (r = 2; r < bh; r++) dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); dst++; - for (c = 0; c < bs - 2; c++) + for (c = 0; c < bw - 2; c++) dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) dst[c] = dst[-stride + c - 2]; + for (r = 1; r < bh; ++r) { + for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2]; dst += stride; } } -static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, - const uint16_t *above, +static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, const uint16_t *left, int bd) { int r; (void)left; (void)bd; - for (r = 0; r < bs; r++) { - memcpy(dst, above, bs * sizeof(uint16_t)); + for (r = 0; r < bh; r++) { + memcpy(dst, above, bw * sizeof(uint16_t)); dst += stride; } } -static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, - const uint16_t *above, +static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, const uint16_t *left, int bd) { int r; (void)above; (void)bd; - for (r = 0; r < bs; r++) { - aom_memset16(dst, left[r], bs); + for (r = 0; r < bh; r++) { + aom_memset16(dst, left[r], bw); dst += stride; } } @@ -777,39 +796,42 @@ void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, #if CONFIG_ALT_INTRA static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; const uint16_t ytop_left = above[-1]; (void)bd; - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) dst[c] = paeth_predictor_single(left[r], above[c], ytop_left); dst += stride; } } static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, + const uint16_t *above, const uint16_t *left, int bd) { - const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel - const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights = sm_weight_arrays + bs; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; // scale = 2 * 2^sm_weight_log2_scale const int log2_scale = 1 + sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale, + log2_scale + sizeof(*dst)); int r; - for (r = 0; r < bs; ++r) { + for (r = 0; r < bh; ++r) { int c; - for (c = 0; c < bs; ++c) { + for (c = 0; c < bw; ++c) { const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred }; - const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r], - sm_weights[c], scale - sm_weights[c] }; + const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r], + sm_weights_w[c], scale - sm_weights_w[c] }; uint32_t this_pred = 0; int i; - assert(scale >= sm_weights[r] && scale >= sm_weights[c]); + assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]); for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } @@ -821,19 +843,21 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, #if CONFIG_SMOOTH_HV static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, + const uint16_t *above, const uint16_t *left, int bd) { - const uint16_t below_pred = left[bs - 1]; // estimated by bottom-left pixel - const uint8_t *const sm_weights = sm_weight_arrays + bs; + const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel + const uint8_t *const sm_weights = sm_weight_arrays + bh; // scale = 2^sm_weight_log2_scale const int log2_scale = sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); int r; - for (r = 0; r < bs; r++) { + for (r = 0; r < bh; r++) { int c; - for (c = 0; c < bs; ++c) { + for (c = 0; c < bw; ++c) { const uint16_t pixels[] = { above[c], below_pred }; const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] }; uint32_t this_pred = 0; @@ -849,19 +873,21 @@ static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, } static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, + const uint16_t *above, const uint16_t *left, int bd) { - const uint16_t right_pred = above[bs - 1]; // estimated by top-right pixel - const uint8_t *const sm_weights = sm_weight_arrays + bs; + const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel + const uint8_t *const sm_weights = sm_weight_arrays + bw; // scale = 2^sm_weight_log2_scale const int log2_scale = sm_weight_log2_scale; const uint16_t scale = (1 << sm_weight_log2_scale); - sm_weights_sanity_checks(sm_weights, scale, log2_scale + sizeof(*dst)); + sm_weights_sanity_checks(sm_weights, sm_weights, scale, + log2_scale + sizeof(*dst)); int r; - for (r = 0; r < bs; r++) { + for (r = 0; r < bh; r++) { int c; - for (c = 0; c < bs; ++c) { + for (c = 0; c < bw; ++c) { const uint16_t pixels[] = { left[r], right_pred }; const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] }; uint32_t this_pred = 0; @@ -878,15 +904,15 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, #endif #else -static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, - const uint16_t *above, +static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, const uint16_t *left, int bd) { int r, c; int ytop_left = above[-1]; (void)bd; - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); dst += stride; } @@ -894,66 +920,71 @@ static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, #endif // CONFIG_ALT_INTRA static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, + const uint16_t *above, const uint16_t *left, int bd) { int r; (void)above; (void)left; - for (r = 0; r < bs; r++) { - aom_memset16(dst, 128 << (bd - 8), bs); + for (r = 0; r < bh; r++) { + aom_memset16(dst, 128 << (bd - 8), bw); dst += stride; } } static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, + const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; (void)above; (void)bd; - for (i = 0; i < bs; i++) sum += left[i]; - expected_dc = (sum + (bs >> 1)) / bs; + for (i = 0; i < bh; i++) sum += left[i]; + expected_dc = (sum + (bh >> 1)) / bh; - for (r = 0; r < bs; r++) { - aom_memset16(dst, expected_dc, bs); + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); dst += stride; } } static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, + int bw, int bh, + const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; (void)left; (void)bd; - for (i = 0; i < bs; i++) sum += above[i]; - expected_dc = (sum + (bs >> 1)) / bs; + for (i = 0; i < bw; i++) sum += above[i]; + expected_dc = (sum + (bw >> 1)) / bw; - for (r = 0; r < bs; r++) { - aom_memset16(dst, expected_dc, bs); + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); dst += stride; } } -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, - const uint16_t *above, +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, const uint16_t *left, int bd) { int i, r, expected_dc, sum = 0; - const int count = 2 * bs; + const int count = bw + bh; (void)bd; - for (i = 0; i < bs; i++) { + for (i = 0; i < bw; i++) { sum += above[i]; + } + for (i = 0; i < bh; i++) { sum += left[i]; } expected_dc = (sum + (count >> 1)) / count; - for (r = 0; r < bs; r++) { - aom_memset16(dst, expected_dc, bs); + for (r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); dst += stride; } } @@ -962,99 +993,121 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bs, // This serves as a wrapper function, so that all the prediction functions // can be unified and accessed as a pointer array. Note that the boundary // above and left are not necessarily used all the time. -#define intra_pred_sized(type, size) \ - void aom_##type##_predictor_##size##x##size##_c( \ - uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ - const uint8_t *left) { \ - type##_predictor(dst, stride, size, above, left); \ +#define intra_pred_sized(type, width, height) \ + void aom_##type##_predictor_##width##x##height##_c( \ + uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \ + const uint8_t *left) { \ + type##_predictor(dst, stride, width, height, above, left); \ } #if CONFIG_HIGHBITDEPTH -#define intra_pred_highbd_sized(type, size) \ - void aom_highbd_##type##_predictor_##size##x##size##_c( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ - highbd_##type##_predictor(dst, stride, size, above, left, bd); \ +#define intra_pred_highbd_sized(type, width, height) \ + void aom_highbd_##type##_predictor_##width##x##height##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \ } /* clang-format off */ #if CONFIG_TX64X64 -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_sized(type, 64) \ - intra_pred_highbd_sized(type, 2) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) \ - intra_pred_highbd_sized(type, 64) - +#define intra_pred_rectangular(type) \ + intra_pred_sized(type, 4, 8) \ + intra_pred_sized(type, 8, 4) \ + intra_pred_sized(type, 8, 16) \ + intra_pred_sized(type, 16, 8) \ + intra_pred_sized(type, 16, 32) \ + intra_pred_sized(type, 32, 16) \ + intra_pred_highbd_sized(type, 4, 8) \ + intra_pred_highbd_sized(type, 8, 4) \ + intra_pred_highbd_sized(type, 8, 16) \ + intra_pred_highbd_sized(type, 16, 8) \ + intra_pred_highbd_sized(type, 16, 32) \ + intra_pred_highbd_sized(type, 32, 16) #define intra_pred_above_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_sized(type, 64) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) \ - intra_pred_highbd_sized(type, 64) -#else // CONFIG_TX64X64 + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 64, 64) \ + intra_pred_rectangular(type) #define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 2) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) - + intra_pred_sized(type, 2, 2) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 2, 2) \ + intra_pred_above_4x4(type) +#else // CONFIG_TX64X64 +#define intra_pred_rectangular(type) \ + intra_pred_sized(type, 4, 8) \ + intra_pred_sized(type, 8, 4) \ + intra_pred_sized(type, 8, 16) \ + intra_pred_sized(type, 16, 8) \ + intra_pred_sized(type, 16, 32) \ + intra_pred_sized(type, 32, 16) \ + intra_pred_highbd_sized(type, 4, 8) \ + intra_pred_highbd_sized(type, 8, 4) \ + intra_pred_highbd_sized(type, 8, 16) \ + intra_pred_highbd_sized(type, 16, 8) \ + intra_pred_highbd_sized(type, 16, 32) \ + intra_pred_highbd_sized(type, 32, 16) #define intra_pred_above_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_highbd_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 8, 8) \ + intra_pred_highbd_sized(type, 16, 16) \ + intra_pred_highbd_sized(type, 32, 32) \ + intra_pred_rectangular(type) +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2, 2) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_highbd_sized(type, 2, 2) \ + intra_pred_above_4x4(type) #endif // CONFIG_TX64X64 #else #if CONFIG_TX64X64 -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_sized(type, 64) - +#define intra_pred_rectangular(type) \ + intra_pred_sized(type, 4, 8) \ + intra_pred_sized(type, 8, 4) \ + intra_pred_sized(type, 8, 16) \ + intra_pred_sized(type, 16, 8) \ + intra_pred_sized(type, 16, 32) \ + intra_pred_sized(type, 32, 16) #define intra_pred_above_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_sized(type, 64) -#else // CONFIG_TX64X64 + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ + intra_pred_rectangular(type) #define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) - + intra_pred_sized(type, 2, 2) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_above_4x4(type) +#else // CONFIG_TX64X64 +#define intra_pred_rectangular(type) \ + intra_pred_sized(type, 4, 8) \ + intra_pred_sized(type, 8, 4) \ + intra_pred_sized(type, 8, 16) \ + intra_pred_sized(type, 16, 8) \ + intra_pred_sized(type, 16, 32) \ + intra_pred_sized(type, 32, 16) #define intra_pred_above_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) + intra_pred_sized(type, 8, 8) \ + intra_pred_sized(type, 16, 16) \ + intra_pred_sized(type, 32, 32) \ + intra_pred_rectangular(type) +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 2, 2) \ + intra_pred_sized(type, 4, 4) \ + intra_pred_above_4x4(type) #endif // CONFIG_TX64X64 + #endif // CONFIG_HIGHBITDEPTH intra_pred_allsizes(d207e) diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c index 6e7d8c928..398eb0a12 100644 --- a/third_party/aom/aom_dsp/inv_txfm.c +++ b/third_party/aom/aom_dsp/inv_txfm.c @@ -14,6 +14,9 @@ #include "./aom_dsp_rtcd.h" #include "aom_dsp/inv_txfm.h" +#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 +#include "av1/common/daala_tx.h" +#endif void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, @@ -93,6 +96,18 @@ void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { } } +#if CONFIG_DAALA_DCT4 +void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { + int i; + od_coeff x[4]; + od_coeff y[4]; + for (i = 0; i < 4; i++) y[i] = input[i]; + od_bin_idct4(x, 1, y); + for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i]; +} + +#else + void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step[4]; tran_high_t temp1, temp2; @@ -112,6 +127,7 @@ void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { output[2] = WRAPLOW(step[1] - step[2]); output[3] = WRAPLOW(step[0] - step[3]); } +#endif void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[4 * 4]; @@ -156,6 +172,18 @@ void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, } } +#if CONFIG_DAALA_DCT8 +void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { + int i; + od_coeff x[8]; + od_coeff y[8]; + for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; + od_bin_idct8(x, 1, y); + for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; +} + +#else + void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; @@ -209,6 +237,7 @@ void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { output[6] = WRAPLOW(step1[1] - step1[6]); output[7] = WRAPLOW(step1[0] - step1[7]); } +#endif void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[8 * 8]; @@ -284,6 +313,18 @@ void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) { output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); } +#if CONFIG_DAALA_DCT8 +void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { + int i; + od_coeff x[8]; + od_coeff y[8]; + for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; + od_bin_idst8(x, 1, y); + for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; +} + +#else + void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; @@ -361,6 +402,8 @@ void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { output[7] = WRAPLOW(-x1); } +#endif + void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; @@ -1179,6 +1222,109 @@ void aom_idct32_c(const tran_low_t *input, tran_low_t *output) { output[31] = WRAPLOW(step1[0] - step1[31]); } +#if CONFIG_MRC_TX +void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride, int *mask) { + tran_low_t out[32 * 32]; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + for (i = 0; i < 32; ++i) { + int16_t zero_coeff[16]; + for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; + for (j = 0; j < 8; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 4; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + for (j = 0; j < 2; ++j) + zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; + + if (zero_coeff[0] | zero_coeff[1]) + aom_idct32_c(input, outptr); + else + memset(outptr, 0, sizeof(tran_low_t) * 32); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + // Only add the coefficient if the mask value is 1 + int mask_val = mask[j * 32 + i]; + dest[j * stride + i] = + mask_val ? clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)) + : dest[j * stride + i]; + } + } +} + +void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int *mask) { + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 16x16 has non-zero coeff + for (i = 0; i < 16; ++i) { + aom_idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + // Only add the coefficient if the mask value is 1 + int mask_val = mask[j * 32 + i]; + dest[j * stride + i] = + mask_val ? clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)) + : dest[j * stride + i]; + } + } +} + +void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int *mask) { + tran_low_t out[32 * 32] = { 0 }; + tran_low_t *outptr = out; + int i, j; + tran_low_t temp_in[32], temp_out[32]; + + // Rows + // only upper-left 8x8 has non-zero coeff + for (i = 0; i < 8; ++i) { + aom_idct32_c(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; + aom_idct32_c(temp_in, temp_out); + for (j = 0; j < 32; ++j) { + // Only add the coefficient if the mask value is 1 + int mask_val = mask[j * 32 + i]; + dest[j * stride + i] = + mask_val ? clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)) + : dest[j * stride + i]; + } + } +} +#endif // CONFIG_MRC_TX + void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[32 * 32]; @@ -1283,7 +1429,6 @@ void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -#if CONFIG_HIGHBITDEPTH void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, @@ -1374,936 +1519,3 @@ void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, dest++; } } - -void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - (void)bd; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - // stage 2 - output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); - output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); - output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); - output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); -} - -void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // Rows - for (i = 0; i < 4; ++i) { - aom_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - aom_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } -} - -void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); - a1 = ROUND_POWER_OF_TWO(out, 4); - - for (i = 0; i < 4; i++) { - dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); - dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); - dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); - dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; - } -} - -void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - // stage 2 & stage 3 - even half - aom_highbd_idct4_c(step1, step1, bd); - - // stage 2 - odd half - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - // stage 3 - odd half - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - // stage 4 - output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); -} - -void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - (void)bd; - - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); -} - -void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[7]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[5]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[3]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[1]; - tran_low_t x7 = input[6]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); - return; - } - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x4, bd); - output[2] = HIGHBD_WRAPLOW(x6, bd); - output[3] = HIGHBD_WRAPLOW(-x2, bd); - output[4] = HIGHBD_WRAPLOW(x3, bd); - output[5] = HIGHBD_WRAPLOW(-x7, bd); - output[6] = HIGHBD_WRAPLOW(x5, bd); - output[7] = HIGHBD_WRAPLOW(-x1, bd); -} - -void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - (void)bd; - - // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 / 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); - step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); - step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); - step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); - step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); - step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); - step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); - - // stage 6 - step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); - output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); - output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); - output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); - output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); - output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); - output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); - output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); - output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); - output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); - output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); - output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); - output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); - output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); - output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); - output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); -} - -void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void)bd; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = HIGHBD_WRAPLOW(s0 + s4, bd); - x1 = HIGHBD_WRAPLOW(s1 + s5, bd); - x2 = HIGHBD_WRAPLOW(s2 + s6, bd); - x3 = HIGHBD_WRAPLOW(s3 + s7, bd); - x4 = HIGHBD_WRAPLOW(s0 - s4, bd); - x5 = HIGHBD_WRAPLOW(s1 - s5, bd); - x6 = HIGHBD_WRAPLOW(s2 - s6, bd); - x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); - x8 = HIGHBD_WRAPLOW(s8 + s10, bd); - x9 = HIGHBD_WRAPLOW(s9 + s11, bd); - x10 = HIGHBD_WRAPLOW(s8 - s10, bd); - x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x8, bd); - output[2] = HIGHBD_WRAPLOW(x12, bd); - output[3] = HIGHBD_WRAPLOW(-x4, bd); - output[4] = HIGHBD_WRAPLOW(x6, bd); - output[5] = HIGHBD_WRAPLOW(x14, bd); - output[6] = HIGHBD_WRAPLOW(x10, bd); - output[7] = HIGHBD_WRAPLOW(x2, bd); - output[8] = HIGHBD_WRAPLOW(x3, bd); - output[9] = HIGHBD_WRAPLOW(x11, bd); - output[10] = HIGHBD_WRAPLOW(x15, bd); - output[11] = HIGHBD_WRAPLOW(x7, bd); - output[12] = HIGHBD_WRAPLOW(x5, bd); - output[13] = HIGHBD_WRAPLOW(-x13, bd); - output[14] = HIGHBD_WRAPLOW(x9, bd); - output[15] = HIGHBD_WRAPLOW(-x1, bd); -} - -void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; - (void)bd; - - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); - step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); - step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); - step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); - step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); - step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); - step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); - step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); - step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); - step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); - step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); - step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); - step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); - step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); - step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); - step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); - step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); - step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); - step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); - step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); - step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); - step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); - step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); - step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); - step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); - step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); - step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); - step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); - step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); - - step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); - step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); - step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); - step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); - step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); - step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); - step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); - step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); - - // stage 5 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); - step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); - step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[7] = step2[7]; - - step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); - step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); - step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); - step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); - step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); - step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); - step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); - step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); - step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); - step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); - step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); - step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); - step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); - step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); - step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); - step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); - step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); - step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); - step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); - - step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); - step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); - step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); - step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); - step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); - step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); - step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); - step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); - - // stage 7 - step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); - step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); - step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); - step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); - step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); - step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); - step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); - step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); - step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); - step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); - step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); - step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); - step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); - step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); - step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); - step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final stage - output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd); - output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); - output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); - output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); - output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); - output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); - output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); - output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); - output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); - output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); - output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); - output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); - output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); - output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); - output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); - output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); - output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); - output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); - output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); - output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); - output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); - output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); - output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); - output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); - output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); - output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); - output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); - output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); - output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); - output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); - output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); - output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); -} - -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h index e64d463ea..a9c485e74 100644 --- a/third_party/aom/aom_dsp/inv_txfm.h +++ b/third_party/aom/aom_dsp/inv_txfm.h @@ -23,8 +23,7 @@ extern "C" { #endif static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return rv; + return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); } static INLINE tran_high_t check_range(tran_high_t input, int bd) { @@ -51,9 +50,19 @@ static INLINE tran_high_t check_range(tran_high_t input, int bd) { } #define WRAPLOW(x) ((int32_t)check_range(x, 8)) -#if CONFIG_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd)) -#endif // CONFIG_HIGHBITDEPTH + +#if CONFIG_MRC_TX +// These each perform dct but add coefficients based on a mask +void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, + int stride, int *mask); + +void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int *mask); + +void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, + int *mask); +#endif // CONFIG_MRC_TX void aom_idct4_c(const tran_low_t *input, tran_low_t *output); void aom_idct8_c(const tran_low_t *input, tran_low_t *output); @@ -63,7 +72,6 @@ void aom_iadst4_c(const tran_low_t *input, tran_low_t *output); void aom_iadst8_c(const tran_low_t *input, tran_low_t *output); void aom_iadst16_c(const tran_low_t *input, tran_low_t *output); -#if CONFIG_HIGHBITDEPTH void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); @@ -78,7 +86,6 @@ static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, trans = HIGHBD_WRAPLOW(trans, bd); return clip_pixel_highbd(dest + (int)trans, bd); } -#endif static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { trans = WRAPLOW(trans); diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c index dc9c63226..43dce8ba6 100644 --- a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c +++ b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c @@ -926,23 +926,3 @@ void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out, out + (8 * i * 32)); } } - -void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16, stride); - sum += LD_HADD(input + 24, stride); - sum += LD_HADD(input + 32 * 8, stride); - sum += LD_HADD(input + 32 * 8 + 8, stride); - sum += LD_HADD(input + 32 * 8 + 16, stride); - sum += LD_HADD(input + 32 * 8 + 24, stride); - sum += LD_HADD(input + 32 * 16, stride); - sum += LD_HADD(input + 32 * 16 + 8, stride); - sum += LD_HADD(input + 32 * 16 + 16, stride); - sum += LD_HADD(input + 32 * 16 + 24, stride); - sum += LD_HADD(input + 32 * 24, stride); - sum += LD_HADD(input + 32 * 24 + 8, stride); - sum += LD_HADD(input + 32 * 24 + 16, stride); - sum += LD_HADD(input + 32 * 24 + 24, stride); - out[0] = (int16_t)(sum >> 3); -} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c index f16d290c8..7a285b7b8 100644 --- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c +++ b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c @@ -236,11 +236,3 @@ void aom_fdct16x16_msa(const int16_t *input, int16_t *output, fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); } } - -void aom_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - int sum = LD_HADD(input, stride); - sum += LD_HADD(input + 8, stride); - sum += LD_HADD(input + 16 * 8, stride); - sum += LD_HADD(input + 16 * 8 + 8, stride); - out[0] = (int16_t)(sum >> 1); -} diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c index eefe7521f..a42fb806b 100644 --- a/third_party/aom/aom_dsp/prob.c +++ b/third_party/aom/aom_dsp/prob.c @@ -186,10 +186,8 @@ int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, for (i = 1; i < nsymbs; i++) { cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i - 1]) + cdf[i]); } -// Store symbol count at the end of the CDF -#if CONFIG_EC_ADAPT + // Store symbol count at the end of the CDF cdf[nsymbs] = 0; -#endif return nsymbs; } diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h index ec6654ab7..35db134e5 100644 --- a/third_party/aom/aom_dsp/prob.h +++ b/third_party/aom/aom_dsp/prob.h @@ -148,7 +148,6 @@ static INLINE void av1_tree_to_cdf(const aom_tree_index *tree, void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree); -#if CONFIG_EC_ADAPT static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { const int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); const int rate2 = 5; @@ -183,7 +182,6 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { #endif cdf[nsymbs] += (cdf[nsymbs] < 32); } -#endif #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c index 0759c22e3..fe98b6028 100644 --- a/third_party/aom/aom_dsp/quantize.c +++ b/third_party/aom/aom_dsp/quantize.c @@ -256,7 +256,6 @@ void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, } #endif // CONFIG_TX64X64 -#if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, @@ -523,7 +522,6 @@ void aom_highbd_quantize_b_64x64_c( *eob_ptr = eob + 1; } #endif // CONFIG_TX64X64 -#endif // CONFIG_HIGHBITDEPTH #else // CONFIG_AOM_QM @@ -602,7 +600,6 @@ void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, } #endif // CONFIG_TX64X64 -#if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, @@ -716,8 +713,7 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, if (abs_coeff >= zbins[rc != 0]) { const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; if (abs_qcoeff) eob = i; @@ -767,8 +763,7 @@ void aom_highbd_quantize_b_32x32_c( const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = idx_arr[i]; @@ -818,8 +813,7 @@ void aom_highbd_quantize_b_64x64_c( const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; if (abs_qcoeff) eob = idx_arr[i]; @@ -828,5 +822,4 @@ void aom_highbd_quantize_b_64x64_c( *eob_ptr = eob + 1; } #endif // CONFIG_TX64X64 -#endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_AOM_QM diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c index 2cc172ba5..b9c789ce5 100644 --- a/third_party/aom/aom_dsp/sad.c +++ b/third_party/aom/aom_dsp/sad.c @@ -153,10 +153,21 @@ sadMxN(4, 4) sadMxNxK(4, 4, 3) sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) + +#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +sadMxN(4, 16) +sadMxNx4D(4, 16) +sadMxN(16, 4) +sadMxNx4D(16, 4) +sadMxN(8, 32) +sadMxNx4D(8, 32) +sadMxN(32, 8) +sadMxNx4D(32, 8) +#endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; @@ -307,11 +318,22 @@ highbd_sadMxN(4, 4) highbd_sadMxNxK(4, 4, 3) highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) + +#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +highbd_sadMxN(4, 16) +highbd_sadMxNx4D(4, 16) +highbd_sadMxN(16, 4) +highbd_sadMxNx4D(16, 4) +highbd_sadMxN(8, 32) +highbd_sadMxNx4D(8, 32) +highbd_sadMxN(32, 8) +highbd_sadMxNx4D(32, 8) +#endif /* clang-format on */ #endif // CONFIG_HIGHBITDEPTH #if CONFIG_AV1 && CONFIG_EXT_INTER - static INLINE + static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, @@ -367,10 +389,17 @@ MASKSADMxN(8, 8) MASKSADMxN(8, 4) MASKSADMxN(4, 8) MASKSADMxN(4, 4) + +#if CONFIG_EXT_PARTITION_TYPES +MASKSADMxN(4, 16) +MASKSADMxN(16, 4) +MASKSADMxN(8, 32) +MASKSADMxN(32, 8) +#endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, @@ -429,6 +458,13 @@ HIGHBD_MASKSADMXN(8, 8) HIGHBD_MASKSADMXN(8, 4) HIGHBD_MASKSADMXN(4, 8) HIGHBD_MASKSADMXN(4, 4) + +#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +HIGHBD_MASKSADMXN(4, 16) +HIGHBD_MASKSADMXN(16, 4) +HIGHBD_MASKSADMXN(8, 32) +HIGHBD_MASKSADMXN(32, 8) +#endif #endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_AV1 && CONFIG_EXT_INTER @@ -480,10 +516,17 @@ OBMCSADMxN(8, 8) OBMCSADMxN(8, 4) OBMCSADMxN(4, 8) OBMCSADMxN(4, 4) + +#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +OBMCSADMxN(4, 16) +OBMCSADMxN(16, 4) +OBMCSADMxN(8, 32) +OBMCSADMxN(32, 8) +#endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { @@ -529,6 +572,13 @@ HIGHBD_OBMCSADMXN(8, 8) HIGHBD_OBMCSADMXN(8, 4) HIGHBD_OBMCSADMXN(4, 8) HIGHBD_OBMCSADMXN(4, 4) + +#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +HIGHBD_OBMCSADMXN(4, 16) +HIGHBD_OBMCSADMXN(16, 4) +HIGHBD_OBMCSADMXN(8, 32) +HIGHBD_OBMCSADMXN(32, 8) +#endif /* clang-format on */ #endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h index a4b334ea6..cbea55ca1 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h @@ -508,17 +508,19 @@ SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { to enforce that. */ #define v256_shl_n_byte(a, n) \ ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n), \ - v128_shr_n_byte(a.lo, 16 - (n))), \ + v128_shr_n_byte(a.lo, (16 - (n)) & 31)), \ v128_shl_n_byte(a.lo, (n))) \ - : v256_from_v128((n) > 16 ? v128_shl_n_byte(a.lo, (n)-16) : a.lo, \ - v128_zero())) - -#define v256_shr_n_byte(a, n) \ - ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n), \ - v128_or(v128_shr_n_byte(a.lo, n), \ - v128_shl_n_byte(a.hi, 16 - (n)))) \ - : v256_from_v128(v128_zero(), \ - (n) > 16 ? v128_shr_n_byte(a.hi, (n)-16) : a.hi)) + : v256_from_v128( \ + (n) > 16 ? v128_shl_n_byte(a.lo, ((n)-16) & 31) : a.lo, \ + v128_zero())) + +#define v256_shr_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n), \ + v128_or(v128_shr_n_byte(a.lo, n), \ + v128_shl_n_byte(a.hi, (16 - (n)) & 31))) \ + : v256_from_v128( \ + v128_zero(), \ + (n) > 16 ? v128_shr_n_byte(a.hi, ((n)-16) & 31) : a.hi)) #define v256_align(a, b, c) \ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h index a5e964aad..01732ae64 100644 --- a/third_party/aom/aom_dsp/txfm_common.h +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -21,6 +21,27 @@ #define UNIT_QUANT_SHIFT 2 #define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT) +typedef struct txfm_param { + // for both forward and inverse transforms + int tx_type; + int tx_size; + int lossless; + int bd; +#if CONFIG_MRC_TX || CONFIG_LGT + int stride; + uint8_t *dst; +#endif // CONFIG_MRC_TX || CONFIG_LGT +#if CONFIG_LGT + int is_inter; + int mode; +#endif +// for inverse transforms only +#if CONFIG_ADAPT_SCAN + const int16_t *eob_threshold; +#endif + int eob; +} TxfmParam; + // Constants: // for (int i = 1; i< 32; ++i) // printf("static const int cospi_%d_64 = %.0f;\n", i, @@ -67,4 +88,62 @@ static const tran_high_t sinpi_4_9 = 15212; // 16384 * sqrt(2) static const tran_high_t Sqrt2 = 23170; +static INLINE tran_high_t fdct_round_shift(tran_high_t input) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return rv; +} + +#if CONFIG_LGT +// The Line Graph Transforms (LGTs) matrices are written as follows. +// Each 2D array is 16384 times an LGT matrix, which is the matrix of +// eigenvectors of the graph Laplacian matrices for the line graph. + +// LGT4 name: lgt4_140 +// Self loops: 1.400, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000 +static const tran_high_t lgt4_140[4][4] = { + { 4206, 9518, 13524, 15674 }, + { 11552, 14833, 1560, -13453 }, + { 15391, -1906, -14393, 9445 }, + { 12201, -14921, 12016, -4581 }, +}; + +// LGT4 name: lgt4_170 +// Self loops: 1.700, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000 +static const tran_high_t lgt4_170[4][4] = { + { 3636, 9287, 13584, 15902 }, + { 10255, 15563, 2470, -13543 }, + { 14786, 711, -15249, 9231 }, + { 14138, -14420, 10663, -3920 }, +}; + +// LGT8 name: lgt8_150 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150[8][8] = { + { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 }, + { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 }, + { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 }, + { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 }, + { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 }, + { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 }, + { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 }, + { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 }, +}; + +// LGT8 name: lgt8_170 +// Self loops: 1.700, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_170[8][8] = { + { 1858, 4947, 7850, 10458, 12672, 14411, 15607, 16217 }, + { 5494, 13022, 16256, 14129, 7343, -1864, -10456, -15601 }, + { 8887, 16266, 9500, -5529, -15749, -12273, 1876, 14394 }, + { 11870, 13351, -6199, -15984, -590, 15733, 7273, -12644 }, + { 14248, 5137, -15991, 291, 15893, -5685, -13963, 10425 }, + { 15716, -5450, -10010, 15929, -6665, -8952, 16036, -7835 }, + { 15533, -13869, 6559, 3421, -12009, 15707, -13011, 5018 }, + { 11357, -13726, 14841, -14600, 13025, -10259, 6556, -2254 }, +}; +#endif // CONFIG_LGT #endif // AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index 79677c92f..a4c3616e7 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include +#include +#include #include "./aom_config.h" #include "./aom_dsp_rtcd.h" @@ -20,6 +22,9 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/blend.h" +#include "./av1_rtcd.h" +#include "av1/common/filter.h" + uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride) { int distortion = 0; @@ -246,6 +251,13 @@ VARIANCES(4, 2) VARIANCES(2, 4) VARIANCES(2, 2) +#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +VARIANCES(4, 16) +VARIANCES(16, 4) +VARIANCES(8, 32) +VARIANCES(32, 8) +#endif + GET_VAR(16, 16) GET_VAR(8, 8) @@ -271,33 +283,66 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, // Get pred block from up-sampled reference. void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height, - const uint8_t *ref, int ref_stride) { - int i, j, k; - int stride = ref_stride << 3; - - for (i = 0; i < height; i++) { - for (j = 0, k = 0; j < width; j++, k += 8) { - comp_pred[j] = ref[k]; + int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride) { + if (!subpel_x_q3 && !subpel_y_q3) { + int i; + for (i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else { + InterpFilterParams filter; + filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); + if (!subpel_y_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + /*Directly call C version to allow this to work for small (2x2) sizes.*/ + aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); + } else if (!subpel_x_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + /*Directly call C version to allow this to work for small (2x2) sizes.*/ + aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *kernel_x; + const int16_t *kernel_y; + int intermediate_height; + kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + /*Directly call C versions to allow this to work for small (2x2) sizes.*/ + aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, + -1, width, intermediate_height); + aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, + 16, width, height); } - comp_pred += width; - ref += stride; } } void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, int ref_stride) { int i, j; - int stride = ref_stride << 3; + aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, + ref_stride); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { - const int tmp = ref[(j << 3)] + pred[j]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); } comp_pred += width; pred += width; - ref += stride; } } @@ -611,6 +656,13 @@ HIGHBD_VARIANCES(4, 2) HIGHBD_VARIANCES(2, 4) HIGHBD_VARIANCES(2, 2) +#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +HIGHBD_VARIANCES(4, 16) +HIGHBD_VARIANCES(16, 4) +HIGHBD_VARIANCES(8, 32) +HIGHBD_VARIANCES(32, 8) +#endif + HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) @@ -637,37 +689,74 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height, - const uint8_t *ref8, int ref_stride) { - int i, j; - int stride = ref_stride << 3; - - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - comp_pred[j] = ref[(j << 3)]; + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, int bd) { + if (!subpel_x_q3 && !subpel_y_q3) { + const uint16_t *ref; + int i; + ref = CONVERT_TO_SHORTPTR(ref8); + for (i = 0; i < height; i++) { + memcpy(comp_pred, ref, width * sizeof(*comp_pred)); + comp_pred += width; + ref += ref_stride; + } + } else { + InterpFilterParams filter; + filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); + if (!subpel_y_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + /*Directly call C version to allow this to work for small (2x2) sizes.*/ + aom_highbd_convolve8_horiz_c(ref8, ref_stride, + CONVERT_TO_BYTEPTR(comp_pred), width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + /*Directly call C version to allow this to work for small (2x2) sizes.*/ + aom_highbd_convolve8_vert_c(ref8, ref_stride, + CONVERT_TO_BYTEPTR(comp_pred), width, NULL, + -1, kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *kernel_x; + const int16_t *kernel_y; + int intermediate_height; + kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + /*Directly call C versions to allow this to work for small (2x2) sizes.*/ + aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter.taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert_c( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, + 16, width, height, bd); } - comp_pred += width; - ref += stride; } } void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, - int height, const uint8_t *ref8, - int ref_stride) { + int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd) { int i, j; - int stride = ref_stride << 3; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, + ref8, ref_stride, bd); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { - const int tmp = pred[j] + ref[(j << 3)]; - comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1); + comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); } comp_pred += width; pred += width; - ref += stride; } } #endif // CONFIG_HIGHBITDEPTH @@ -694,22 +783,23 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i, j; - int stride = ref_stride << 3; + aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, + ref_stride); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { if (!invert_mask) - comp_pred[j] = AOM_BLEND_A64(mask[j], ref[(j << 3)], pred[j]); + comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]); else - comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[(j << 3)]); + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]); } comp_pred += width; pred += width; - ref += stride; mask += mask_stride; } } @@ -753,6 +843,13 @@ MASK_SUBPIX_VAR(128, 64) MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION +#if CONFIG_EXT_PARTITION_TYPES +MASK_SUBPIX_VAR(4, 16) +MASK_SUBPIX_VAR(16, 4) +MASK_SUBPIX_VAR(8, 32) +MASK_SUBPIX_VAR(32, 8) +#endif + #if CONFIG_HIGHBITDEPTH void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, @@ -775,26 +872,24 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } } -void aom_highbd_comp_mask_upsampled_pred_c(uint16_t *comp_pred, - const uint8_t *pred8, int width, - int height, const uint8_t *ref8, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { +void aom_highbd_comp_mask_upsampled_pred_c( + uint16_t *comp_pred, const uint8_t *pred8, int width, int height, + int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, + const uint8_t *mask, int mask_stride, int invert_mask, int bd) { int i, j; - int stride = ref_stride << 3; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, + ref8, ref_stride, bd); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { if (!invert_mask) - comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j << 3], pred[j]); + comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]); else - comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j << 3]); + comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]); } comp_pred += width; pred += width; - ref += stride; mask += mask_stride; } } @@ -884,6 +979,13 @@ HIGHBD_MASK_SUBPIX_VAR(64, 128) HIGHBD_MASK_SUBPIX_VAR(128, 64) HIGHBD_MASK_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION + +#if CONFIG_EXT_PARTITION_TYPES +HIGHBD_MASK_SUBPIX_VAR(4, 16) +HIGHBD_MASK_SUBPIX_VAR(16, 4) +HIGHBD_MASK_SUBPIX_VAR(8, 32) +HIGHBD_MASK_SUBPIX_VAR(32, 8) +#endif #endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_AV1 && CONFIG_EXT_INTER @@ -983,6 +1085,17 @@ OBMC_VAR(128, 128) OBMC_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION +#if CONFIG_EXT_PARTITION_TYPES +OBMC_VAR(4, 16) +OBMC_SUBPIX_VAR(4, 16) +OBMC_VAR(16, 4) +OBMC_SUBPIX_VAR(16, 4) +OBMC_VAR(8, 32) +OBMC_SUBPIX_VAR(8, 32) +OBMC_VAR(32, 8) +OBMC_SUBPIX_VAR(32, 8) +#endif + #if CONFIG_HIGHBITDEPTH static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, @@ -1164,5 +1277,16 @@ HIGHBD_OBMC_SUBPIX_VAR(128, 64) HIGHBD_OBMC_VAR(128, 128) HIGHBD_OBMC_SUBPIX_VAR(128, 128) #endif // CONFIG_EXT_PARTITION + +#if CONFIG_EXT_PARTITION_TYPES +HIGHBD_OBMC_VAR(4, 16) +HIGHBD_OBMC_SUBPIX_VAR(4, 16) +HIGHBD_OBMC_VAR(16, 4) +HIGHBD_OBMC_SUBPIX_VAR(16, 4) +HIGHBD_OBMC_VAR(8, 32) +HIGHBD_OBMC_SUBPIX_VAR(8, 32) +HIGHBD_OBMC_VAR(32, 8) +HIGHBD_OBMC_SUBPIX_VAR(32, 8) +#endif #endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c index a337e618d..657dcfa22 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c @@ -85,147 +85,6 @@ void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); } -void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, - int stride) { - __m128i in0, in1, in2, in3; - __m128i u0, u1; - __m128i sum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 2; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8)); - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8)); - - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - sum = _mm_add_epi16(sum, u1); - input += 8 * stride; - } - - u0 = _mm_setzero_si128(); - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - in1 = _mm_srai_epi32(in1, 1); - output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); -} - -void aom_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, - int stride) { - __m128i in0, in1, in2, in3; - __m128i u0, u1; - __m128i sum = _mm_setzero_si128(); - int i; - - for (i = 0; i < 8; ++i) { - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - in0 = _mm_load_si128((const __m128i *)(input + 0)); - in1 = _mm_load_si128((const __m128i *)(input + 8)); - in2 = _mm_load_si128((const __m128i *)(input + 16)); - in3 = _mm_load_si128((const __m128i *)(input + 24)); - - input += stride; - sum = _mm_add_epi16(sum, u1); - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, u0); - - sum = _mm_add_epi16(sum, u1); - } - - u0 = _mm_setzero_si128(); - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - in1 = _mm_srai_epi32(in1, 3); - output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); -} - #define DCT_HIGH_BIT_DEPTH 0 #define FDCT4x4_2D aom_fdct4x4_sse2 #define FDCT8x8_2D aom_fdct8x8_sse2 diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c new file mode 100644 index 000000000..2bbf15ef2 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom/aom_integer.h" + +static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { + const __m128i sign = _mm_srai_epi16(*p, 15); + const __m128i dc = _mm_unpacklo_epi16(*p, sign); + const __m128i ac = _mm_unpackhi_epi16(*p, sign); + *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1); +} + +static INLINE void update_qp(__m256i *qp) { + int i; + for (i = 0; i < 5; ++i) { + qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11); + } +} + +static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *dequant_ptr, + const int16_t *quant_shift_ptr, __m256i *qp) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); + const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); + const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + init_one_qp(&zbin, &qp[0]); + init_one_qp(&round, &qp[1]); + init_one_qp(&quant, &qp[2]); + init_one_qp(&dequant, &qp[3]); + init_one_qp(&quant_shift, &qp[4]); +} + +// Note: +// *x is vector multiplied by *y which is 16 int32_t parallel multiplication +// and right shift 16. The output, 16 int32_t is save in *p. +static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y, + __m256i *p) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, 16); + const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1); + prod_lo = _mm256_and_si256(prod_lo, mask); + prod_hi = _mm256_srli_epi64(prod_hi, 16); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_or_si256(prod_lo, prod_hi); +} + +static INLINE void quantize(const __m256i *qp, __m256i *c, + const int16_t *iscan_ptr, tran_low_t *qcoeff, + tran_low_t *dqcoeff, __m256i *eob) { + const __m256i abs = _mm256_abs_epi32(*c); + const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]); + __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]); + flag2 = _mm256_or_si256(flag1, flag2); + const int32_t nzflag = _mm256_movemask_epi8(flag2); + + if (LIKELY(nzflag)) { + __m256i q = _mm256_add_epi32(abs, qp[1]); + __m256i tmp; + mm256_mul_shift_epi32(&q, &qp[2], &tmp); + q = _mm256_add_epi32(tmp, q); + + mm256_mul_shift_epi32(&q, &qp[4], &q); + __m256i dq = _mm256_mullo_epi32(q, qp[3]); + + q = _mm256_sign_epi32(q, *c); + dq = _mm256_sign_epi32(dq, *c); + q = _mm256_and_si256(q, flag2); + dq = _mm256_and_si256(dq, flag2); + + _mm256_storeu_si256((__m256i *)qcoeff, q); + _mm256_storeu_si256((__m256i *)dqcoeff, dq); + + const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); + const __m128i zr = _mm_setzero_si128(); + const __m128i lo = _mm_unpacklo_epi16(isc, zr); + const __m128i hi = _mm_unpackhi_epi16(isc, zr); + const __m256i iscan = + _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i zc = _mm256_cmpeq_epi32(dq, zero); + const __m256i nz = _mm256_cmpeq_epi32(zc, zero); + __m256i cur_eob = _mm256_sub_epi32(iscan, nz); + cur_eob = _mm256_and_si256(cur_eob, nz); + *eob = _mm256_max_epi32(cur_eob, *eob); + } else { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)dqcoeff, zero); + } +} + +void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + (void)scan; + const unsigned int step = 8; + + if (LIKELY(!skip_block)) { + __m256i qp[5], coeff; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); + + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); + } + } else { + do { + const __m256i zero = _mm256_setzero_si256(); + _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); + _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); + qcoeff_ptr += step; + dqcoeff_ptr += step; + n_coeffs -= step; + } while (n_coeffs > 0); + *eob_ptr = 0; + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 3ee24ab16..5570ca5b7 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,7 +15,6 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" -#if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -152,4 +151,3 @@ void aom_highbd_quantize_b_32x32_sse2( } *eob_ptr = eob + 1; } -#endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm index 0c7cb3998..9c3bbdd69 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -288,3 +288,9 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 +%if CONFIG_EXT_PARTITION_TYPES +HIGH_SADNXN4D 4, 16 +HIGH_SADNXN4D 16, 4 +HIGH_SADNXN4D 8, 32 +HIGH_SADNXN4D 32, 8 +%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm index 8427b891c..248b98ef5 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -227,6 +227,10 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2 +HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2 +%endif ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -295,7 +299,10 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +%if CONFIG_EXT_PARTITION_TYPES +HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 +HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 +%endif ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -364,3 +371,7 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 +HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 +%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 797e9c1d4..ee19796e3 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -75,7 +75,7 @@ SECTION .text paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse movd [r1], m7 ; store sse - movd rax, m6 ; store sum as return value + movd eax, m6 ; store sum as return value %endif RET %endmacro diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 29f96ce24..93923ffb0 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include #include // SSE2 #include "./aom_config.h" @@ -16,6 +17,9 @@ #include "aom_ports/mem.h" +#include "./av1_rtcd.h" +#include "av1/common/filter.h" + typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, uint32_t *sse, int *sum); @@ -181,6 +185,11 @@ VAR_FN(16, 16, 16, 8); VAR_FN(16, 8, 8, 7); VAR_FN(8, 16, 8, 7); VAR_FN(8, 8, 8, 6); +#if CONFIG_EXT_PARTITION_TYPES +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 16, 8); +#endif #undef VAR_FN @@ -387,6 +396,7 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } +#if CONFIG_EXT_PARTITION_TYPES #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ FN(64, 32, 16, 6, 5, opt, (int64_t)); \ @@ -398,7 +408,24 @@ DECLS(sse2); FN(16, 8, 16, 4, 3, opt, (int64_t)); \ FN(8, 16, 8, 3, 4, opt, (int64_t)); \ FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)) +#else +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)) +#endif FNS(sse2); @@ -412,9 +439,9 @@ FNS(sse2); const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ void *unused); -#define DECLS(opt1) \ - DECL(16, opt1) \ - DECL(8, opt1) +#define DECLS(opt) \ + DECL(16, opt) \ + DECL(8, opt) DECLS(sse2); #undef DECL @@ -546,18 +573,36 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt1) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ - FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int64_t)); +#if CONFIG_EXT_PARTITION_TYPES +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); +#else +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); +#endif FNS(sse2); @@ -565,131 +610,94 @@ FNS(sse2); #undef FN void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, - const uint8_t *ref8, int ref_stride) { - int i, j; - int stride = ref_stride << 3; - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - - if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); - __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); - __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); - __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); - __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32)); - __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40)); - __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48)); - __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56)); - __m128i t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi16(s0, s1); - t1 = _mm_unpacklo_epi16(s2, s3); - t2 = _mm_unpacklo_epi16(s4, s5); - t3 = _mm_unpacklo_epi16(s6, s7); - t0 = _mm_unpacklo_epi32(t0, t1); - t2 = _mm_unpacklo_epi32(t2, t3); - t0 = _mm_unpacklo_epi64(t0, t2); - - _mm_storeu_si128((__m128i *)(comp_pred), t0); + int subpel_x_q3, int subpel_y_q3, + const uint8_t *ref8, int ref_stride, + int bd) { + if (!subpel_x_q3 && !subpel_y_q3) { + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + if (width >= 8) { + int i; + assert(!(width & 7)); + /*Read 8 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 8) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + _mm_storeu_si128((__m128i *)comp_pred, s0); + comp_pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + int i; + assert(!(width & 3)); + /*Read 4 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); + __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + __m128i t0 = _mm_unpacklo_epi64(s0, s1); + _mm_storeu_si128((__m128i *)comp_pred, t0); comp_pred += 8; - ref += 64; // 8 * 8; + ref += 2 * ref_stride; } - ref += stride - (width << 3); } } else { - // read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); - __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); - __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); - __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi16(s0, s1); - t1 = _mm_unpacklo_epi16(s2, s3); - t0 = _mm_unpacklo_epi32(t0, t1); - - _mm_storel_epi64((__m128i *)(comp_pred), t0); - comp_pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); + InterpFilterParams filter; + filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); + if (!subpel_y_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, + CONVERT_TO_BYTEPTR(comp_pred), width, kernel, + 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), + width, NULL, -1, kernel, 16, width, height, bd); + } else { + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *kernel_x; + const int16_t *kernel_y; + int intermediate_height; + kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, + 16, width, height, bd); } } } void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, const uint8_t *pred8, int width, - int height, const uint8_t *ref8, - int ref_stride) { - const __m128i one = _mm_set1_epi16(1); - int i, j; - int stride = ref_stride << 3; + int height, int subpel_x_q3, + int subpel_y_q3, + const uint8_t *ref8, + int ref_stride, int bd) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); - - if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); - __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); - __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); - __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); - __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32)); - __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40)); - __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48)); - __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56)); - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - __m128i t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi16(s0, s1); - t1 = _mm_unpacklo_epi16(s2, s3); - t2 = _mm_unpacklo_epi16(s4, s5); - t3 = _mm_unpacklo_epi16(s6, s7); - t0 = _mm_unpacklo_epi32(t0, t1); - t2 = _mm_unpacklo_epi32(t2, t3); - t0 = _mm_unpacklo_epi64(t0, t2); - - p0 = _mm_adds_epu16(t0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - - _mm_storeu_si128((__m128i *)(comp_pred), p0); - comp_pred += 8; - pred += 8; - ref += 8 * 8; - } - ref += stride - (width << 3); - } - } else { - // read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); - __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8)); - __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16)); - __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24)); - __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi16(s0, s1); - t1 = _mm_unpacklo_epi16(s2, s3); - t0 = _mm_unpacklo_epi32(t0, t1); - - p0 = _mm_adds_epu16(t0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - - _mm_storel_epi64((__m128i *)(comp_pred), p0); - comp_pred += 4; - pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); - } + int n; + int i; + aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, + ref8, ref_stride, bd); + /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ + assert(!(width * height & 7)); + n = width * height >> 3; + for (i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0)); + comp_pred += 8; + pred += 8; } } diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c index be200df4c..86ce928b7 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c @@ -3498,237 +3498,3 @@ void idct32_8col(__m128i *in0, __m128i *in1) { in1[14] = _mm_sub_epi16(stp1_1, stp1_30); in1[15] = _mm_sub_epi16(stp1_0, stp1_31); } - -#if CONFIG_HIGHBITDEPTH -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; -} - -void aom_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - aom_idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - array_transpose_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - aom_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } - } - - if (optimised_cols) { - aom_idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - aom_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } - } - } -} - -void aom_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } - - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - aom_idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - aom_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } - - if (optimised_cols) { - aom_idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - aom_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } -} - -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 9d16a3e84..6a73ac460 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -93,6 +93,12 @@ MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) +#if CONFIG_EXT_PARTITION_TYPES +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +#endif static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -283,6 +289,12 @@ HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) +#if CONFIG_EXT_PARTITION_TYPES +HIGHBD_MASKSAD4XN_SSSE3(16) +HIGHBD_MASKSADMXN_SSSE3(16, 4) +HIGHBD_MASKSADMXN_SSSE3(8, 32) +HIGHBD_MASKSADMXN_SSSE3(32, 8) +#endif static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c index be9d437d2..24e7ed1c6 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -126,6 +126,12 @@ MASK_SUBPIX_VAR8XH_SSSE3(8) MASK_SUBPIX_VAR8XH_SSSE3(4) MASK_SUBPIX_VAR4XH_SSSE3(8) MASK_SUBPIX_VAR4XH_SSSE3(4) +#if CONFIG_EXT_PARTITION_TYPES +MASK_SUBPIX_VAR4XH_SSSE3(16) +MASK_SUBPIX_VAR_SSSE3(16, 4) +MASK_SUBPIX_VAR8XH_SSSE3(32) +MASK_SUBPIX_VAR_SSSE3(32, 8) +#endif static INLINE __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -564,6 +570,7 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ uint64_t sse64; \ int sum; \ + int64_t var; \ uint16_t temp[(H + 1) * W]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ @@ -579,7 +586,8 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, msk_stride, W, H, &sse64, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \ sum = ROUND_POWER_OF_TWO(sum, 2); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ @@ -587,6 +595,7 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ uint64_t sse64; \ int sum; \ + int64_t var; \ uint16_t temp[(H + 1) * W]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ @@ -602,7 +611,8 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, msk_stride, W, H, &sse64, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \ sum = ROUND_POWER_OF_TWO(sum, 4); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } #define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \ @@ -634,6 +644,7 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ int sse_; \ int sum; \ + int64_t var; \ uint16_t temp[(H + 1) * 4]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ @@ -649,7 +660,8 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, msk_stride, H, &sse_, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \ sum = ROUND_POWER_OF_TWO(sum, 2); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } \ unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \ @@ -657,6 +669,7 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \ int sse_; \ int sum; \ + int64_t var; \ uint16_t temp[(H + 1) * 4]; \ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ @@ -672,7 +685,8 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, msk_stride, H, &sse_, &sum); \ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \ sum = ROUND_POWER_OF_TWO(sum, 4); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } #if CONFIG_EXT_PARTITION @@ -693,6 +707,12 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) +#if CONFIG_EXT_PARTITION_TYPES +HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +#endif static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, const __m128i filter) { diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c index 21632644f..3fd6f71e5 100644 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -137,6 +137,12 @@ OBMCSADWXH(8, 8) OBMCSADWXH(8, 4) OBMCSADWXH(4, 8) OBMCSADWXH(4, 4) +#if CONFIG_EXT_PARTITION_TYPES +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +#endif //////////////////////////////////////////////////////////////////////////////// // High bit-depth @@ -260,4 +266,10 @@ HBD_OBMCSADWXH(8, 8) HBD_OBMCSADWXH(8, 4) HBD_OBMCSADWXH(4, 8) HBD_OBMCSADWXH(4, 4) +#if CONFIG_EXT_PARTITION_TYPES +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +#endif #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 1797ded80..44cfa8e28 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -146,6 +146,12 @@ OBMCVARWXH(8, 8) OBMCVARWXH(8, 4) OBMCVARWXH(4, 8) OBMCVARWXH(4, 4) +#if CONFIG_EXT_PARTITION_TYPES +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +#endif //////////////////////////////////////////////////////////////////////////////// // High bit-depth @@ -353,4 +359,10 @@ HBD_OBMCVARWXH(8, 8) HBD_OBMCVARWXH(8, 4) HBD_OBMCVARWXH(4, 8) HBD_OBMCVARWXH(4, 4) +#if CONFIG_EXT_PARTITION_TYPES +HBD_OBMCVARWXH(4, 16) +HBD_OBMCVARWXH(16, 4) +HBD_OBMCVARWXH(8, 32) +HBD_OBMCVARWXH(32, 8) +#endif #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm index 8f04ef2f3..4570e2ce6 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -251,3 +251,9 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 +%if CONFIG_EXT_PARTITION_TYPES +SADNXN4D 4, 16 +SADNXN4D 16, 4 +SADNXN4D 8, 32 +SADNXN4D 32, 8 +%endif diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c index 196394379..e8dd87a26 100644 --- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c @@ -704,7 +704,12 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; +#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + const __m256i mask = _mm256_setr_epi32(UINT32_MAX, 0, UINT32_MAX, 0, + UINT32_MAX, 0, UINT32_MAX, 0); +#else const __m256i mask = _mm256_set1_epi64x(UINT32_MAX); +#endif __m128i sad; // 8 32-bit summation diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm index e45457a57..88d427077 100644 --- a/third_party/aom/aom_dsp/x86/sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -208,6 +208,10 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +SAD32XN 8 ; sad_32x8_sse2 +SAD32XN 8, 1 ; sad_32x8_avg_sse2 +%endif ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -254,6 +258,10 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +SAD16XN 4 ; sad_16x4_sse2 +SAD16XN 4, 1 ; sad_16x4_avg_sse2 +%endif ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -298,6 +306,10 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +SAD8XN 32 ; sad_8x32_sse2 +SAD8XN 32, 1 ; sad_8x32_avg_sse2 +%endif ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -343,3 +355,7 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse +%if CONFIG_EXT_PARTITION_TYPES +SAD4XN 16 ; sad_4x16_sse2 +SAD4XN 16, 1 ; sad_4x16_avg_sse2 +%endif diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index d9563aa7f..918844185 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -17,6 +17,9 @@ #include "aom_ports/mem.h" +#include "./av1_rtcd.h" +#include "av1/common/filter.h" + typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, const unsigned char *ref, int ref_stride, unsigned int *sse, int *sum); @@ -335,6 +338,52 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, return *sse; } +#if CONFIG_EXT_PARTITION_TYPES +unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum, + get4x4var_sse2, 4); + assert(sum <= 255 * 4 * 16); + assert(sum >= -255 * 4 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); +} + +unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum, + get4x4var_sse2, 4); + assert(sum <= 255 * 16 * 4); + assert(sum >= -255 * 16 * 4); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); +} + +unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum, + aom_get8x8var_sse2, 8); + assert(sum <= 255 * 8 * 32); + assert(sum >= -255 * 8 * 32); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); +} + +unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum, + aom_get8x8var_sse2, 8); + assert(sum <= 255 * 32 * 8); + assert(sum >= -255 * 32 * 8); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); +} +#endif + // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm #define DECL(w, opt) \ @@ -342,13 +391,13 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ void *unused0, void *unused) -#define DECLS(opt1, opt2) \ - DECL(4, opt1); \ - DECL(8, opt1); \ - DECL(16, opt1) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) -DECLS(sse2, sse2); -DECLS(ssse3, ssse3); +DECLS(sse2); +DECLS(ssse3); #undef DECLS #undef DECL @@ -384,23 +433,44 @@ DECLS(ssse3, ssse3); return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t)) - -FNS(sse2, sse2); -FNS(ssse3, ssse3); +#if CONFIG_EXT_PARTITION_TYPES +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ + FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)) +#else +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) +#endif + +FNS(sse2); +FNS(ssse3); #undef FNS #undef FN @@ -412,13 +482,13 @@ FNS(ssse3, ssse3); const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ void *unused) -#define DECLS(opt1, opt2) \ - DECL(4, opt1); \ - DECL(8, opt1); \ - DECL(16, opt1) +#define DECLS(opt) \ + DECL(4, opt); \ + DECL(8, opt); \ + DECL(16, opt) -DECLS(sse2, sse2); -DECLS(ssse3, ssse3); +DECLS(sse2); +DECLS(ssse3); #undef DECL #undef DECLS @@ -455,236 +525,149 @@ DECLS(ssse3, ssse3); return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#define FNS(opt1, opt2) \ - FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t)) - -FNS(sse2, sse); -FNS(ssse3, ssse3); +#if CONFIG_EXT_PARTITION_TYPES +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ + FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)) +#else +#define FNS(opt) \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) +#endif + +FNS(sse2); +FNS(ssse3); #undef FNS #undef FN void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, + int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride) { - int i, j; - int stride = ref_stride << 3; - - if (width >= 16) { - // read 16 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 16) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); - __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); - __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); - __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); - __m128i t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - t2 = _mm_unpacklo_epi8(s4, s5); - s5 = _mm_unpackhi_epi8(s4, s5); - t3 = _mm_unpacklo_epi8(s6, s7); - s7 = _mm_unpackhi_epi8(s6, s7); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s4 = _mm_unpacklo_epi8(t2, s5); - s6 = _mm_unpacklo_epi8(t3, s7); - s0 = _mm_unpacklo_epi32(s0, s2); - s4 = _mm_unpacklo_epi32(s4, s6); - s0 = _mm_unpacklo_epi64(s0, s4); - - _mm_storeu_si128((__m128i *)(comp_pred), s0); + if (!subpel_x_q3 && !subpel_y_q3) { + if (width >= 16) { + int i; + assert(!(width & 15)); + /*Read 16 pixels one row at a time.*/ + for (i = 0; i < height; i++) { + int j; + for (j = 0; j < width; j += 16) { + __m128i s0 = _mm_loadu_si128((const __m128i *)ref); + _mm_storeu_si128((__m128i *)comp_pred, s0); + comp_pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + int i; + assert(!(width & 7)); + assert(!(height & 1)); + /*Read 8 pixels two rows at a time.*/ + for (i = 0; i < height; i += 2) { + __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); + __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); + __m128i t0 = _mm_unpacklo_epi64(s0, s1); + _mm_storeu_si128((__m128i *)comp_pred, t0); comp_pred += 16; - ref += 16 * 8; + ref += 2 * ref_stride; } - ref += stride - (width << 3); - } - } else if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s0 = _mm_unpacklo_epi32(s0, s2); - - _mm_storel_epi64((__m128i *)(comp_pred), s0); - comp_pred += 8; - ref += 8 * 8; + } else { + int i; + assert(!(width & 3)); + assert(!(height & 3)); + /*Read 4 pixels four rows at a time.*/ + for (i = 0; i < height; i++) { + __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); + __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + ref_stride)); + __m128i s2 = + _mm_cvtsi32_si128(*(const uint32_t *)(ref + 2 * ref_stride)); + __m128i s3 = + _mm_cvtsi32_si128(*(const uint32_t *)(ref + 3 * ref_stride)); + __m128i t0 = _mm_unpacklo_epi32(s0, s1); + __m128i t1 = _mm_unpacklo_epi32(s2, s3); + __m128i u0 = _mm_unpacklo_epi64(t0, t1); + _mm_storeu_si128((__m128i *)comp_pred, u0); + comp_pred += 16; + ref += 4 * ref_stride; } - ref += stride - (width << 3); } } else { - // read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i t0; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - s0 = _mm_unpacklo_epi8(t0, s1); - - *(int *)comp_pred = _mm_cvtsi128_si32(s0); - comp_pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); + InterpFilterParams filter; + filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); + if (!subpel_y_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); + } else if (!subpel_x_q3) { + const int16_t *kernel; + kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); + } else { + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *kernel_x; + const int16_t *kernel_y; + int intermediate_height; + kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); } } } void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, int ref_stride) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - int i, j; - int stride = ref_stride << 3; - - if (width >= 16) { - // read 16 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 16) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64)); - __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80)); - __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96)); - __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112)); - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - __m128i p1; - __m128i t0, t1, t2, t3; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - t2 = _mm_unpacklo_epi8(s4, s5); - s5 = _mm_unpackhi_epi8(s4, s5); - t3 = _mm_unpacklo_epi8(s6, s7); - s7 = _mm_unpackhi_epi8(s6, s7); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s4 = _mm_unpacklo_epi8(t2, s5); - s6 = _mm_unpacklo_epi8(t3, s7); - - s0 = _mm_unpacklo_epi32(s0, s2); - s4 = _mm_unpacklo_epi32(s4, s6); - s0 = _mm_unpacklo_epi8(s0, zero); - s4 = _mm_unpacklo_epi8(s4, zero); - - p1 = _mm_unpackhi_epi8(p0, zero); - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p1 = _mm_adds_epu16(s4, p1); - p0 = _mm_adds_epu16(p0, one); - p1 = _mm_adds_epu16(p1, one); - - p0 = _mm_srli_epi16(p0, 1); - p1 = _mm_srli_epi16(p1, 1); - p0 = _mm_packus_epi16(p0, p1); - - _mm_storeu_si128((__m128i *)(comp_pred), p0); - comp_pred += 16; - pred += 16; - ref += 16 * 8; - } - ref += stride - (width << 3); - } - } else if (width >= 8) { - // read 8 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32)); - __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48)); - __m128i p0 = _mm_loadl_epi64((const __m128i *)pred); - __m128i t0, t1; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - t1 = _mm_unpacklo_epi8(s2, s3); - s3 = _mm_unpackhi_epi8(s2, s3); - - s0 = _mm_unpacklo_epi8(t0, s1); - s2 = _mm_unpacklo_epi8(t1, s3); - s0 = _mm_unpacklo_epi32(s0, s2); - s0 = _mm_unpacklo_epi8(s0, zero); - - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - p0 = _mm_packus_epi16(p0, zero); - - _mm_storel_epi64((__m128i *)(comp_pred), p0); - comp_pred += 8; - pred += 8; - ref += 8 * 8; - } - ref += stride - (width << 3); - } - } else { - // read 4 points at one time - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16)); - __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred); - __m128i t0; - - t0 = _mm_unpacklo_epi8(s0, s1); - s1 = _mm_unpackhi_epi8(s0, s1); - s0 = _mm_unpacklo_epi8(t0, s1); - s0 = _mm_unpacklo_epi8(s0, zero); - - p0 = _mm_unpacklo_epi8(p0, zero); - p0 = _mm_adds_epu16(s0, p0); - p0 = _mm_adds_epu16(p0, one); - p0 = _mm_srli_epi16(p0, 1); - p0 = _mm_packus_epi16(p0, zero); - - *(int *)comp_pred = _mm_cvtsi128_si32(p0); - comp_pred += 4; - pred += 4; - ref += 4 * 8; - } - ref += stride - (width << 3); - } + int n; + int i; + aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, + ref_stride); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + for (i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred); + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0)); + comp_pred += 16; + pred += 16; } } -- cgit v1.2.3 From ec910d81405c736a4490383a250299a7837c2e64 Mon Sep 17 00:00:00 2001 From: trav90 Date: Thu, 18 Oct 2018 21:53:44 -0500 Subject: Update aom to commit id e87fb2378f01103d5d6e477a4ef6892dc714e614 --- third_party/aom/aom_dsp/aom_dsp.cmake | 124 +- third_party/aom/aom_dsp/aom_dsp.mk | 20 +- third_party/aom/aom_dsp/aom_dsp_common.h | 7 +- third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 1761 +++++++++++--------- third_party/aom/aom_dsp/arm/intrapred_neon.c | 225 --- third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm | 346 ---- third_party/aom/aom_dsp/binary_codes_reader.c | 49 +- third_party/aom/aom_dsp/binary_codes_reader.h | 5 + third_party/aom/aom_dsp/binary_codes_writer.c | 60 +- third_party/aom/aom_dsp/binary_codes_writer.h | 5 + third_party/aom/aom_dsp/bitreader.h | 15 + third_party/aom/aom_dsp/bitwriter.h | 17 +- third_party/aom/aom_dsp/buf_ans.c | 3 +- third_party/aom/aom_dsp/buf_ans.h | 3 +- third_party/aom/aom_dsp/daalaboolreader.c | 2 +- third_party/aom/aom_dsp/daalaboolreader.h | 5 +- third_party/aom/aom_dsp/daalaboolwriter.c | 4 - third_party/aom/aom_dsp/daalaboolwriter.h | 4 - third_party/aom/aom_dsp/entcode.h | 8 +- third_party/aom/aom_dsp/entdec.c | 55 +- third_party/aom/aom_dsp/entdec.h | 6 +- third_party/aom/aom_dsp/entenc.c | 41 +- third_party/aom/aom_dsp/intrapred.c | 84 +- third_party/aom/aom_dsp/intrapred_common.h | 51 + third_party/aom/aom_dsp/inv_txfm.c | 49 +- third_party/aom/aom_dsp/inv_txfm.h | 9 +- third_party/aom/aom_dsp/loopfilter.c | 326 +++- third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c | 5 + third_party/aom/aom_dsp/mips/convolve8_dspr2.c | 7 + third_party/aom/aom_dsp/mips/intrapred16_dspr2.c | 2 + third_party/aom/aom_dsp/mips/intrapred4_dspr2.c | 145 +- third_party/aom/aom_dsp/mips/intrapred8_dspr2.c | 455 +---- third_party/aom/aom_dsp/mips/intrapred_msa.c | 190 --- third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h | 4 +- third_party/aom/aom_dsp/prob.h | 22 +- third_party/aom/aom_dsp/psnr.c | 34 +- third_party/aom/aom_dsp/psnr.h | 2 + third_party/aom/aom_dsp/quantize.c | 781 ++------- third_party/aom/aom_dsp/quantize.h | 77 +- third_party/aom/aom_dsp/sad.c | 44 +- third_party/aom/aom_dsp/ssim.c | 56 +- third_party/aom/aom_dsp/txfm_common.h | 659 +++++++- third_party/aom/aom_dsp/variance.c | 60 +- third_party/aom/aom_dsp/variance.h | 6 +- .../aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 8 +- third_party/aom/aom_dsp/x86/common_avx2.h | 147 ++ third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h | 24 +- third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 20 +- .../aom/aom_dsp/x86/highbd_intrapred_avx2.c | 239 +++ .../aom/aom_dsp/x86/highbd_intrapred_sse2.asm | 197 --- .../aom/aom_dsp/x86/highbd_intrapred_sse2.c | 1256 ++++++++++++++ .../aom/aom_dsp/x86/highbd_intrapred_ssse3.c | 521 ++++++ .../aom/aom_dsp/x86/highbd_loopfilter_avx2.c | 873 ++++++++++ .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 1010 +++++------ third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm | 2 + third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm | 7 +- third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c | 249 +-- third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 10 +- third_party/aom/aom_dsp/x86/intrapred_avx2.c | 413 +++++ third_party/aom/aom_dsp/x86/intrapred_sse2.asm | 146 -- third_party/aom/aom_dsp/x86/intrapred_sse2.c | 684 ++++++++ third_party/aom/aom_dsp/x86/intrapred_ssse3.c | 885 ++++++++++ third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h | 22 +- third_party/aom/aom_dsp/x86/inv_txfm_sse2.h | 12 +- third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 144 +- third_party/aom/aom_dsp/x86/lpf_common_sse2.h | 130 ++ .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 16 +- .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 14 +- third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 4 + third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 16 +- third_party/aom/aom_dsp/x86/quantize_sse2.c | 38 +- third_party/aom/aom_dsp/x86/sad4d_sse2.asm | 2 + third_party/aom/aom_dsp/x86/sad_sse2.asm | 6 + third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 130 +- third_party/aom/aom_dsp/x86/txfm_common_intrin.h | 20 +- third_party/aom/aom_dsp/x86/variance_sse2.c | 30 +- 76 files changed, 8530 insertions(+), 4578 deletions(-) create mode 100644 third_party/aom/aom_dsp/intrapred_common.h create mode 100644 third_party/aom/aom_dsp/x86/common_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/lpf_common_sse2.h (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 3ce6761ca..11b55caa7 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -23,6 +23,7 @@ set(AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" "${AOM_ROOT}/aom_dsp/intrapred.c" + "${AOM_ROOT}/aom_dsp/intrapred_common.h" "${AOM_ROOT}/aom_dsp/loopfilter.c" "${AOM_ROOT}/aom_dsp/prob.c" "${AOM_ROOT}/aom_dsp/prob.h" @@ -45,7 +46,9 @@ set(AOM_DSP_COMMON_ASM_SSE2 set(AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c") set(AOM_DSP_COMMON_ASM_SSSE3 @@ -55,6 +58,7 @@ set(AOM_DSP_COMMON_ASM_SSSE3 set(AOM_DSP_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c") set(AOM_DSP_COMMON_INTRIN_SSE4_1 @@ -64,16 +68,28 @@ set(AOM_DSP_COMMON_INTRIN_SSE4_1 set(AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h") +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_AVX2 + ${AOM_DSP_COMMON_INTRIN_AVX2} + "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c") +endif () + +if (NOT CONFIG_EXT_PARTITION) + set(AOM_DSP_COMMON_ASM_NEON + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm") +endif () + set(AOM_DSP_COMMON_ASM_NEON - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm" + ${AOM_DSP_COMMON_ASM_NEON} "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm" "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm" "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm" @@ -83,33 +99,53 @@ set(AOM_DSP_COMMON_ASM_NEON "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm" "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm" "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm") +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_ASM_NEON + ${AOM_DSP_COMMON_ASM_NEON} + "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm") +endif () + +if (NOT CONFIG_EXT_PARTITION) + set(AOM_DSP_COMMON_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c") +endif () + set(AOM_DSP_COMMON_INTRIN_NEON - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c" + ${AOM_DSP_COMMON_INTRIN_NEON} "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c" "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c") +endif () + if ("${AOM_TARGET_CPU}" STREQUAL "arm64") + if (NOT CONFIG_EXT_PARTITION) + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c" + "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c") + endif () + set(AOM_DSP_COMMON_INTRIN_NEON ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c" @@ -118,10 +154,15 @@ if ("${AOM_TARGET_CPU}" STREQUAL "arm64") "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c" "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c") + + if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_NEON + ${AOM_DSP_COMMON_INTRIN_NEON} + "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") + endif () endif () set(AOM_DSP_COMMON_INTRIN_DSPR2 @@ -141,14 +182,19 @@ set(AOM_DSP_COMMON_INTRIN_DSPR2 "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") + +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_DSPR2 + ${AOM_DSP_COMMON_INTRIN_DSPR2} + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") +endif () set(AOM_DSP_COMMON_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c" @@ -169,13 +215,18 @@ set(AOM_DSP_COMMON_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c" "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h" "${AOM_ROOT}/aom_dsp/mips/macros_msa.h" "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h") +if (NOT CONFIG_PARALLEL_DEBLOCKING) + set(AOM_DSP_COMMON_INTRIN_MSA + ${AOM_DSP_COMMON_INTRIN_MSA} + "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" + "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h") +endif () + if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_COMMON_ASM_SSE2 ${AOM_DSP_COMMON_ASM_SSE2} @@ -185,11 +236,18 @@ if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_COMMON_INTRIN_SSE2 ${AOM_DSP_COMMON_INTRIN_SSE2} + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") + set(AOM_DSP_COMMON_INTRIN_SSSE3 + ${AOM_DSP_COMMON_INTRIN_SSSE3} + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_ssse3.c") + set(AOM_DSP_COMMON_INTRIN_AVX2 ${AOM_DSP_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c") else () set(AOM_DSP_COMMON_INTRIN_DSPR2 ${AOM_DSP_COMMON_INTRIN_DSPR2} @@ -332,12 +390,10 @@ if (CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") - if (CONFIG_EXT_INTER) set(AOM_DSP_ENCODER_INTRIN_SSSE3 ${AOM_DSP_ENCODER_INTRIN_SSSE3} "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c") - endif () if (CONFIG_HIGHBITDEPTH) set(AOM_DSP_ENCODER_INTRIN_SSE2 diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk index f9d675ac0..950db0216 100644 --- a/third_party/aom/aom_dsp/aom_dsp.mk +++ b/third_party/aom/aom_dsp/aom_dsp.mk @@ -64,6 +64,7 @@ endif # intra predictions DSP_SRCS-yes += intrapred.c +DSP_SRCS-yes += intrapred_common.h ifneq ($(CONFIG_ANS),yes) DSP_SRCS-yes += entcode.c @@ -75,9 +76,16 @@ DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm +DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.c +DSP_SRCS-$(HAVE_AVX2) += x86/intrapred_avx2.c + ifeq ($(CONFIG_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_avx2.c endif # CONFIG_HIGHBITDEPTH DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) @@ -120,6 +128,7 @@ DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c endif DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm +ifneq ($(CONFIG_EXT_PARTITION),yes) ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM) DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM) @@ -135,6 +144,7 @@ DSP_SRCS-yes += arm/aom_convolve_avg_neon.c DSP_SRCS-yes += arm/aom_convolve_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM +endif # CONFIG_EXT_PARTITION # common (msa) DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c @@ -164,7 +174,10 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c DSP_SRCS-yes += loopfilter.c DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c -DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c +DSP_SRCS-$(HAVE_SSE2) += x86/lpf_common_sse2.h + +ifneq ($(CONFIG_PARALLEL_DEBLOCKING),yes) +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c ifeq ($(HAVE_NEON_ASM),yes) @@ -191,13 +204,16 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c +endif # !CONFIG_PARALLEL_DEBLOCKING ifeq ($(CONFIG_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/highbd_loopfilter_avx2.c endif # CONFIG_HIGHBITDEPTH DSP_SRCS-yes += txfm_common.h DSP_SRCS-yes += x86/txfm_common_intrin.h +DSP_SRCS-$(HAVE_AVX2) += x86/common_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/obmc_intrinsic_ssse3.h DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h @@ -343,10 +359,8 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c endif ifeq ($(CONFIG_AV1_ENCODER),yes) -ifeq ($(CONFIG_EXT_INTER),yes) DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c -endif #CONFIG_EXT_INTER ifeq ($(CONFIG_MOTION_VAR),yes) DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h index 5b104321b..3d3bcba37 100644 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -52,10 +52,9 @@ extern "C" { #define UNLIKELY(v) (v) #endif -#if CONFIG_AOM_QM typedef uint16_t qm_val_t; #define AOM_QM_BITS 5 -#endif + #if CONFIG_HIGHBITDEPTH // Note: // tran_low_t is the datatype used for final transform coefficients. @@ -78,6 +77,10 @@ static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); } +static INLINE uint32_t clamp32u(uint32_t value, uint32_t low, uint32_t high) { + return value < low ? low : (value > high ? high : value); +} + static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { return value < low ? low : (value > high ? high : value); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 0c0356870..f4f6c64d4 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -40,11 +40,17 @@ foreach $w (@block_widths) { push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ; } } -if (aom_config("CONFIG_EXT_PARTITION_TYPES")) { +if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") { push @block_sizes, [4, 16]; push @block_sizes, [16, 4]; push @block_sizes, [8, 32]; push @block_sizes, [32, 8]; + push @block_sizes, [16, 64]; + push @block_sizes, [64, 16]; + if (aom_config("CONFIG_EXT_PARTITION") eq "yes") { + push @block_sizes, [32, 128]; + push @block_sizes, [128, 32]; + } } @tx_dims = (2, 4, 8, 16, 32); @@ -60,14 +66,9 @@ foreach $w (@tx_dims) { } } -@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153/; -if (aom_config("CONFIG_ALT_INTRA") eq "yes") { - push @pred_names, qw/paeth smooth/; - if (aom_config("CONFIG_SMOOTH_HV") eq "yes") { - push @pred_names, qw/smooth_v smooth_h/; - } -} else { - push @pred_names, 'tm'; +@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153 paeth smooth/; +if (aom_config("CONFIG_SMOOTH_HV") eq "yes") { + push @pred_names, qw/smooth_v smooth_h/; } # @@ -86,70 +87,185 @@ foreach (@tx_sizes) { } } -specialize qw/aom_d63e_predictor_4x4 ssse3/; -specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; -specialize qw/aom_d135_predictor_4x4 neon/; -specialize qw/aom_d153_predictor_4x4 ssse3/; -specialize qw/aom_v_predictor_4x4 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_4x4 neon dspr2 msa sse2/; -} # CONFIG_ALT_INTRA -specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/; +specialize qw/aom_dc_top_predictor_4x8 sse2/; +specialize qw/aom_dc_top_predictor_8x4 sse2/; +specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/; +specialize qw/aom_dc_top_predictor_8x16 sse2/; +specialize qw/aom_dc_top_predictor_16x8 sse2/; +specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/; +specialize qw/aom_dc_top_predictor_16x32 sse2/; +specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/; specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/; +specialize qw/aom_dc_left_predictor_4x8 sse2/; +specialize qw/aom_dc_left_predictor_8x4 sse2/; +specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/; +specialize qw/aom_dc_left_predictor_8x16 sse2/; +specialize qw/aom_dc_left_predictor_16x8 sse2/; +specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/; +specialize qw/aom_dc_left_predictor_16x32 sse2/; +specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/; specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/; +specialize qw/aom_dc_128_predictor_4x8 sse2/; +specialize qw/aom_dc_128_predictor_8x4 sse2/; +specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/; +specialize qw/aom_dc_128_predictor_8x16 sse2/; +specialize qw/aom_dc_128_predictor_16x8 sse2/; +specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/; +specialize qw/aom_dc_128_predictor_16x32 sse2/; +specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_v_predictor_4x4 neon msa sse2/; +specialize qw/aom_v_predictor_4x8 sse2/; +specialize qw/aom_v_predictor_8x4 sse2/; +specialize qw/aom_v_predictor_8x8 neon msa sse2/; +specialize qw/aom_v_predictor_8x16 sse2/; +specialize qw/aom_v_predictor_16x8 sse2/; +specialize qw/aom_v_predictor_16x16 neon msa sse2/; +specialize qw/aom_v_predictor_16x32 sse2/; +specialize qw/aom_v_predictor_32x16 sse2 avx2/; +specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_h_predictor_4x8 sse2/; +specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x4 sse2/; specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_8x16 sse2/; +specialize qw/aom_h_predictor_16x8 sse2/; +specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; +specialize qw/aom_h_predictor_16x32 sse2/; +specialize qw/aom_h_predictor_32x16 sse2/; +specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_paeth_predictor_4x4 ssse3/; +specialize qw/aom_paeth_predictor_4x8 ssse3/; +specialize qw/aom_paeth_predictor_8x4 ssse3/; +specialize qw/aom_paeth_predictor_8x8 ssse3/; +specialize qw/aom_paeth_predictor_8x16 ssse3/; +specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x8 ssse3/; +specialize qw/aom_paeth_predictor_16x16 ssse3/; +specialize qw/aom_paeth_predictor_16x32 ssse3/; +specialize qw/aom_paeth_predictor_32x16 ssse3/; +specialize qw/aom_paeth_predictor_32x32 ssse3/; +specialize qw/aom_smooth_predictor_4x4 ssse3/; +specialize qw/aom_smooth_predictor_4x8 ssse3/; +specialize qw/aom_smooth_predictor_8x4 ssse3/; +specialize qw/aom_smooth_predictor_8x8 ssse3/; +specialize qw/aom_smooth_predictor_8x16 ssse3/; +specialize qw/aom_smooth_predictor_16x8 ssse3/; +specialize qw/aom_smooth_predictor_16x16 ssse3/; +specialize qw/aom_smooth_predictor_16x32 ssse3/; +specialize qw/aom_smooth_predictor_32x16 ssse3/; +specialize qw/aom_smooth_predictor_32x32 ssse3/; + +specialize qw/aom_d63e_predictor_4x4 ssse3/; +specialize qw/aom_d135_predictor_4x4 neon/; +specialize qw/aom_d153_predictor_4x4 ssse3/; +specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; +specialize qw/aom_dc_predictor_4x8 sse2/; specialize qw/aom_d153_predictor_8x8 ssse3/; -specialize qw/aom_v_predictor_8x8 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_8x8 neon dspr2 msa sse2/; -} # CONFIG_ALT_INTRA +specialize qw/aom_dc_predictor_8x4 sse2/; specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/; -specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/; -specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/; -specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/; -specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; +specialize qw/aom_dc_predictor_8x16 sse2/; specialize qw/aom_d153_predictor_16x16 ssse3/; -specialize qw/aom_v_predictor_16x16 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_16x16 neon msa sse2/; -} # CONFIG_ALT_INTRA +specialize qw/aom_dc_predictor_16x8 sse2/; specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/; -specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/; -specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/; -specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/; -specialize qw/aom_h_predictor_32x32 neon msa sse2/; +specialize qw/aom_dc_predictor_16x32 sse2/; specialize qw/aom_d153_predictor_32x32 ssse3/; -specialize qw/aom_v_predictor_32x32 neon msa sse2/; -if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_tm_predictor_32x32 neon msa sse2/; -} # CONFIG_ALT_INTRA -specialize qw/aom_dc_predictor_32x32 msa neon sse2/; -specialize qw/aom_dc_top_predictor_32x32 msa neon sse2/; -specialize qw/aom_dc_left_predictor_32x32 msa neon sse2/; -specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/; + +specialize qw/aom_dc_predictor_32x16 sse2 avx2/; +specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { specialize qw/aom_highbd_v_predictor_4x4 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_4x4 sse2/; - } # CONFIG_ALT_INTRA - specialize qw/aom_highbd_dc_predictor_4x4 sse2/; + specialize qw/aom_highbd_v_predictor_4x8 sse2/; + specialize qw/aom_highbd_v_predictor_8x4 sse2/; specialize qw/aom_highbd_v_predictor_8x8 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_8x8 sse2/; - } # CONFIG_ALT_INTRA - specialize qw/aom_highbd_dc_predictor_8x8 sse2/;; + specialize qw/aom_highbd_v_predictor_8x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x8 sse2/; specialize qw/aom_highbd_v_predictor_16x16 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_16x16 sse2/; - } # CONFIG_ALT_INTRA - specialize qw/aom_highbd_dc_predictor_16x16 sse2/; + specialize qw/aom_highbd_v_predictor_16x32 sse2/; + specialize qw/aom_highbd_v_predictor_32x16 sse2/; specialize qw/aom_highbd_v_predictor_32x32 sse2/; - if (aom_config("CONFIG_ALT_INTRA") eq "") { - specialize qw/aom_highbd_tm_predictor_32x32 sse2/; - } # CONFIG_ALT_INTRA + specialize qw/aom_highbd_dc_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_predictor_8x4 sse2/;; + specialize qw/aom_highbd_dc_predictor_8x8 sse2/;; + specialize qw/aom_highbd_dc_predictor_8x16 sse2/;; + specialize qw/aom_highbd_dc_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_predictor_32x16 sse2/; specialize qw/aom_highbd_dc_predictor_32x32 sse2/; + specialize qw/aom_highbd_h_predictor_4x4 sse2/; + specialize qw/aom_highbd_h_predictor_4x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x4 sse2/; + specialize qw/aom_highbd_h_predictor_8x8 sse2/; + specialize qw/aom_highbd_h_predictor_8x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x8 sse2/; + specialize qw/aom_highbd_h_predictor_16x16 sse2/; + specialize qw/aom_highbd_h_predictor_16x32 sse2/; + specialize qw/aom_highbd_h_predictor_32x16 sse2/; + specialize qw/aom_highbd_h_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/; + specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/; + + specialize qw/aom_highbd_d117_predictor_4x4 sse2/; + specialize qw/aom_highbd_d117_predictor_8x8 ssse3/; + specialize qw/aom_highbd_d117_predictor_16x16 ssse3/; + specialize qw/aom_highbd_d117_predictor_32x32 ssse3/; + specialize qw/aom_highbd_d135_predictor_4x4 sse2/; + specialize qw/aom_highbd_d135_predictor_8x8 ssse3/; + specialize qw/aom_highbd_d135_predictor_16x16 ssse3/; + specialize qw/aom_highbd_d135_predictor_32x32 ssse3/; + specialize qw/aom_highbd_d153_predictor_4x4 sse2/; + specialize qw/aom_highbd_d153_predictor_8x8 ssse3/; + specialize qw/aom_highbd_d153_predictor_16x16 ssse3/; + specialize qw/aom_highbd_d153_predictor_32x32 ssse3/; + + specialize qw/aom_highbd_d45e_predictor_4x4 sse2/; + specialize qw/aom_highbd_d45e_predictor_4x8 sse2/; + specialize qw/aom_highbd_d45e_predictor_8x4 sse2/; + specialize qw/aom_highbd_d45e_predictor_8x8 sse2/; + specialize qw/aom_highbd_d45e_predictor_8x16 sse2/; + specialize qw/aom_highbd_d45e_predictor_16x8 avx2/; + specialize qw/aom_highbd_d45e_predictor_16x16 avx2/; + specialize qw/aom_highbd_d45e_predictor_16x32 avx2/; + specialize qw/aom_highbd_d45e_predictor_32x16 avx2/; + specialize qw/aom_highbd_d45e_predictor_32x32 avx2/; } # CONFIG_HIGHBITDEPTH # @@ -257,83 +373,121 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { # Loopfilter # add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/; -$aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_vertical_16 sse2/; +} else { + specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/; + $aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon; +} add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; -$aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; + $aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon; +} add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_vertical_8 sse2/; +} else { + specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; -$aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; + $aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon; +} add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_vertical_4 sse2/; +} else { + specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; -$aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_edge_8 sse2/; +} else { + specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; + $aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon; +} add_proto qw/void aom_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; -$aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_edge_16 sse2/; +} else { + specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; + $aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon; +} add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_8 sse2/; +} else { + specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; -$aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; + $aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon; +} add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { + specialize qw/aom_lpf_horizontal_4 sse2/; +} else { + specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/; +} add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { + specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; +} if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_16 sse2/; add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_16_dual sse2/; + specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_8 sse2/; add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_8_dual sse2/; + specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_vertical_4 sse2/; add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_4_dual sse2/; + specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_edge_8 sse2/; add_proto qw/void aom_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2/; + specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_8 sse2/; add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_8_dual sse2/; + specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; specialize qw/aom_highbd_lpf_horizontal_4 sse2/; add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_4_dual sse2/; + specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; } # CONFIG_HIGHBITDEPTH # @@ -412,51 +566,48 @@ if (aom_config("CONFIG_AV1") eq "yes") { add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - { - add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_16_add sse2/; + add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct4x4_16_add sse2/; - add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_1_add sse2/; + add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct4x4_1_add sse2/; - add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_64_add sse2 ssse3/; + add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct8x8_64_add sse2 ssse3/; - add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_12_add sse2 ssse3/; + add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct8x8_12_add sse2 ssse3/; - add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_1_add sse2/; + add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct8x8_1_add sse2/; - add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_256_add sse2 avx2/; + add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_256_add sse2 avx2/; - add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_38_add avx2/; + add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_38_add avx2/; - add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_10_add sse2 avx2/; + add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_10_add sse2 avx2/; - add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_1_add sse2 avx2/; + add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct16x16_1_add sse2 avx2/; - add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/; + add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/; - add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/; - # Need to add 135 eob idct32x32 implementations. - $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; + add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/; + # Need to add 135 eob idct32x32 implementations. + $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; - add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/; + add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/; - add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1_add sse2 avx2/; - } -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { -} else { - { + add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + specialize qw/aom_idct32x32_1_add sse2 avx2/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + } else { add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_idct4x4_1_add sse2 neon dspr2 msa/; @@ -508,48 +659,32 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/aom_iwht4x4_16_add msa sse2/; - } -} # CONFIG_HIGHBITDEPTH + } # CONFIG_HIGHBITDEPTH } # CONFIG_AV1 # # Quantization # -if (aom_config("CONFIG_AOM_QM") eq "yes") { - if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr"; - - } # CONFIG_AV1_ENCODER -} else { - if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +} # CONFIG_AV1_ENCODER - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b sse2 avx2/; +if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b sse2 avx2/; - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/aom_highbd_quantize_b_32x32 sse2/; + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_highbd_quantize_b_32x32 sse2/; - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - } # CONFIG_AV1_ENCODER -} # CONFIG_AOM_QM +} # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1") eq "yes") { # # Alpha blending with mask @@ -574,148 +709,147 @@ if (aom_config("CONFIG_AV1") eq "yes") { } } # CONFIG_AV1 -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { -# -# Block subtraction -# -add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/aom_subtract_block neon msa sse2/; - -if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { -# -# Sum of Squares -# -add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; -specialize qw/aom_sum_squares_2d_i16 sse2/; - -add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; -specialize qw/aom_sum_squares_i16 sse2/; -} - - -# -# Avg -# if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # - # Avg + # Block subtraction # - specialize qw/aom_avg_8x8 sse2 neon msa/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/aom_highbd_subtract_block sse2/; + add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; + specialize qw/aom_subtract_block neon msa sse2/; + + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Sum of Squares + # + add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; + specialize qw/aom_sum_squares_2d_i16 sse2/; + + add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; + specialize qw/aom_sum_squares_i16 sse2/; } + # - # Minmax + # Avg # - add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - specialize qw/aom_minmax_8x8 sse2 neon/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - } - - add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64"; + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { + # + # Avg + # + specialize qw/aom_avg_8x8 sse2 neon msa/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/aom_highbd_subtract_block sse2/; + } - add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/aom_hadamard_16x16 sse2 neon/; + # + # Minmax + # + add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + specialize qw/aom_minmax_8x8 sse2 neon/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + } - add_proto qw/int aom_satd/, "const int16_t *coeff, int length"; - specialize qw/aom_satd sse2 neon/; + add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64"; - add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height"; - specialize qw/aom_int_pro_row sse2 neon/; + add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + specialize qw/aom_hadamard_16x16 sse2 neon/; - add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width"; - specialize qw/aom_int_pro_col sse2 neon/; + add_proto qw/int aom_satd/, "const int16_t *coeff, int length"; + specialize qw/aom_satd sse2 neon/; - add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; - specialize qw/aom_vector_var neon sse2/; -} # CONFIG_AV1_ENCODER + add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height"; + specialize qw/aom_int_pro_row sse2 neon/; -# -# Single block SAD / Single block Avg SAD -# -foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -} + add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width"; + specialize qw/aom_int_pro_col sse2 neon/; -specialize qw/aom_sad128x128 avx2 sse2/; -specialize qw/aom_sad128x64 avx2 sse2/; -specialize qw/aom_sad64x128 avx2 sse2/; -specialize qw/aom_sad64x64 avx2 neon msa sse2/; -specialize qw/aom_sad64x32 avx2 msa sse2/; -specialize qw/aom_sad32x64 avx2 msa sse2/; -specialize qw/aom_sad32x32 avx2 neon msa sse2/; -specialize qw/aom_sad32x16 avx2 msa sse2/; -specialize qw/aom_sad16x32 msa sse2/; -specialize qw/aom_sad16x16 neon msa sse2/; -specialize qw/aom_sad16x8 neon msa sse2/; -specialize qw/aom_sad8x16 neon msa sse2/; -specialize qw/aom_sad8x8 neon msa sse2/; -specialize qw/aom_sad8x4 msa sse2/; -specialize qw/aom_sad4x8 msa sse2/; -specialize qw/aom_sad4x4 neon msa sse2/; - -specialize qw/aom_sad128x128_avg avx2 sse2/; -specialize qw/aom_sad128x64_avg avx2 sse2/; -specialize qw/aom_sad64x128_avg avx2 sse2/; -specialize qw/aom_sad64x64_avg avx2 msa sse2/; -specialize qw/aom_sad64x32_avg avx2 msa sse2/; -specialize qw/aom_sad32x64_avg avx2 msa sse2/; -specialize qw/aom_sad32x32_avg avx2 msa sse2/; -specialize qw/aom_sad32x16_avg avx2 msa sse2/; -specialize qw/aom_sad16x32_avg msa sse2/; -specialize qw/aom_sad16x16_avg msa sse2/; -specialize qw/aom_sad16x8_avg msa sse2/; -specialize qw/aom_sad8x16_avg msa sse2/; -specialize qw/aom_sad8x8_avg msa sse2/; -specialize qw/aom_sad8x4_avg msa sse2/; -specialize qw/aom_sad4x8_avg msa sse2/; -specialize qw/aom_sad4x4_avg msa sse2/; + add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; + specialize qw/aom_vector_var neon sse2/; + } # CONFIG_AV1_ENCODER -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + # + # Single block SAD / Single block Avg SAD + # foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; - add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - if ($w != 128 && $h != 128 && $w != 4) { - specialize "aom_highbd_sad${w}x${h}", qw/sse2/; - specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; + add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + } + + specialize qw/aom_sad128x128 avx2 sse2/; + specialize qw/aom_sad128x64 avx2 sse2/; + specialize qw/aom_sad64x128 avx2 sse2/; + specialize qw/aom_sad64x64 avx2 neon msa sse2/; + specialize qw/aom_sad64x32 avx2 msa sse2/; + specialize qw/aom_sad32x64 avx2 msa sse2/; + specialize qw/aom_sad32x32 avx2 neon msa sse2/; + specialize qw/aom_sad32x16 avx2 msa sse2/; + specialize qw/aom_sad16x32 msa sse2/; + specialize qw/aom_sad16x16 neon msa sse2/; + specialize qw/aom_sad16x8 neon msa sse2/; + specialize qw/aom_sad8x16 neon msa sse2/; + specialize qw/aom_sad8x8 neon msa sse2/; + specialize qw/aom_sad8x4 msa sse2/; + specialize qw/aom_sad4x8 msa sse2/; + specialize qw/aom_sad4x4 neon msa sse2/; + + specialize qw/aom_sad128x128_avg avx2 sse2/; + specialize qw/aom_sad128x64_avg avx2 sse2/; + specialize qw/aom_sad64x128_avg avx2 sse2/; + specialize qw/aom_sad64x64_avg avx2 msa sse2/; + specialize qw/aom_sad64x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x64_avg avx2 msa sse2/; + specialize qw/aom_sad32x32_avg avx2 msa sse2/; + specialize qw/aom_sad32x16_avg avx2 msa sse2/; + specialize qw/aom_sad16x32_avg msa sse2/; + specialize qw/aom_sad16x16_avg msa sse2/; + specialize qw/aom_sad16x8_avg msa sse2/; + specialize qw/aom_sad8x16_avg msa sse2/; + specialize qw/aom_sad8x8_avg msa sse2/; + specialize qw/aom_sad8x4_avg msa sse2/; + specialize qw/aom_sad4x8_avg msa sse2/; + specialize qw/aom_sad4x4_avg msa sse2/; + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_sad${w}x${h}", qw/sse2/; + specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; + } } + specialize qw/aom_highbd_sad128x128 avx2/; + specialize qw/aom_highbd_sad128x64 avx2/; + specialize qw/aom_highbd_sad64x128 avx2/; + specialize qw/aom_highbd_sad64x64 avx2/; + specialize qw/aom_highbd_sad64x32 avx2/; + specialize qw/aom_highbd_sad32x64 avx2/; + specialize qw/aom_highbd_sad32x32 avx2/; + specialize qw/aom_highbd_sad32x16 avx2/; + specialize qw/aom_highbd_sad16x32 avx2/; + specialize qw/aom_highbd_sad16x16 avx2/; + specialize qw/aom_highbd_sad16x8 avx2/; + + specialize qw/aom_highbd_sad128x128_avg avx2/; + specialize qw/aom_highbd_sad128x64_avg avx2/; + specialize qw/aom_highbd_sad64x128_avg avx2/; + specialize qw/aom_highbd_sad64x64_avg avx2/; + specialize qw/aom_highbd_sad64x32_avg avx2/; + specialize qw/aom_highbd_sad32x64_avg avx2/; + specialize qw/aom_highbd_sad32x32_avg avx2/; + specialize qw/aom_highbd_sad32x16_avg avx2/; + specialize qw/aom_highbd_sad16x32_avg avx2/; + specialize qw/aom_highbd_sad16x16_avg avx2/; + specialize qw/aom_highbd_sad16x8_avg avx2/; } - specialize qw/aom_highbd_sad128x128 avx2/; - specialize qw/aom_highbd_sad128x64 avx2/; - specialize qw/aom_highbd_sad64x128 avx2/; - specialize qw/aom_highbd_sad64x64 avx2/; - specialize qw/aom_highbd_sad64x32 avx2/; - specialize qw/aom_highbd_sad32x64 avx2/; - specialize qw/aom_highbd_sad32x32 avx2/; - specialize qw/aom_highbd_sad32x16 avx2/; - specialize qw/aom_highbd_sad16x32 avx2/; - specialize qw/aom_highbd_sad16x16 avx2/; - specialize qw/aom_highbd_sad16x8 avx2/; - - specialize qw/aom_highbd_sad128x128_avg avx2/; - specialize qw/aom_highbd_sad128x64_avg avx2/; - specialize qw/aom_highbd_sad64x128_avg avx2/; - specialize qw/aom_highbd_sad64x64_avg avx2/; - specialize qw/aom_highbd_sad64x32_avg avx2/; - specialize qw/aom_highbd_sad32x64_avg avx2/; - specialize qw/aom_highbd_sad32x32_avg avx2/; - specialize qw/aom_highbd_sad32x16_avg avx2/; - specialize qw/aom_highbd_sad16x32_avg avx2/; - specialize qw/aom_highbd_sad16x16_avg avx2/; - specialize qw/aom_highbd_sad16x8_avg avx2/; -} -# -# Masked SAD -# -if (aom_config("CONFIG_EXT_INTER") eq "yes") { + # + # Masked SAD + # foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; @@ -729,318 +863,326 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") { specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/; } } -} - -# -# OBMC SAD -# -if (aom_config("CONFIG_MOTION_VAR") eq "yes") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; - specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/; - } - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + # + # OBMC SAD + # + if (aom_config("CONFIG_MOTION_VAR") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; - specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; + add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/; + } + } + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; + } + } } } -} -# -# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally -# -# Blocks of 3 -foreach $s (@block_widths) { - add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -} -specialize qw/aom_sad64x64x3 msa/; -specialize qw/aom_sad32x32x3 msa/; -specialize qw/aom_sad16x16x3 sse3 ssse3 msa/; -specialize qw/aom_sad8x8x3 sse3 msa/; -specialize qw/aom_sad4x4x3 sse3 msa/; - -add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad16x8x3 sse3 ssse3 msa/; -add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad8x16x3 sse3 msa/; - -# Blocks of 8 -foreach $s (@block_widths) { - add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -} -specialize qw/aom_sad64x64x8 msa/; -specialize qw/aom_sad32x32x8 msa/; -specialize qw/aom_sad16x16x8 sse4_1 msa/; -specialize qw/aom_sad8x8x8 sse4_1 msa/; -specialize qw/aom_sad4x4x8 sse4_1 msa/; - -add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad16x8x8 sse4_1 msa/; -add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad8x16x8 sse4_1 msa/; -add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad8x4x8 msa/; -add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/aom_sad4x8x8 msa/; + # + # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally + # + # Blocks of 3 + foreach $s (@block_widths) { + add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + } + specialize qw/aom_sad64x64x3 msa/; + specialize qw/aom_sad32x32x3 msa/; + specialize qw/aom_sad16x16x3 sse3 ssse3 msa/; + specialize qw/aom_sad8x8x3 sse3 msa/; + specialize qw/aom_sad4x4x3 sse3 msa/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad16x8x3 sse3 ssse3 msa/; + add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad8x16x3 sse3 msa/; + + # Blocks of 8 foreach $s (@block_widths) { + add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + } + specialize qw/aom_sad64x64x8 msa/; + specialize qw/aom_sad32x32x8 msa/; + specialize qw/aom_sad16x16x8 sse4_1 msa/; + specialize qw/aom_sad8x8x8 sse4_1 msa/; + specialize qw/aom_sad4x4x8 sse4_1 msa/; + + add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad16x8x8 sse4_1 msa/; + add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad8x16x8 sse4_1 msa/; + add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad8x4x8 msa/; + add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + specialize qw/aom_sad4x8x8 msa/; + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $s (@block_widths) { + # Blocks of 3 + add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + # Blocks of 8 + add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + } # Blocks of 3 - add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; # Blocks of 8 - add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; + add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; } - # Blocks of 3 - add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - # Blocks of 8 - add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -} - -# -# Multi-block SAD, comparing a reference to N independent blocks -# -foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -} - -specialize qw/aom_sad128x128x4d avx2 sse2/; -specialize qw/aom_sad128x64x4d avx2 sse2/; -specialize qw/aom_sad64x128x4d avx2 sse2/; -specialize qw/aom_sad64x64x4d avx2 neon msa sse2/; -specialize qw/aom_sad64x32x4d avx2 msa sse2/; -specialize qw/aom_sad32x64x4d avx2 msa sse2/; -specialize qw/aom_sad32x32x4d avx2 neon msa sse2/; -specialize qw/aom_sad32x16x4d msa sse2/; -specialize qw/aom_sad16x32x4d msa sse2/; -specialize qw/aom_sad16x16x4d neon msa sse2/; -specialize qw/aom_sad16x8x4d msa sse2/; -specialize qw/aom_sad8x16x4d msa sse2/; -specialize qw/aom_sad8x8x4d msa sse2/; -specialize qw/aom_sad8x4x4d msa sse2/; -specialize qw/aom_sad4x8x4d msa sse2/; -specialize qw/aom_sad4x4x4d msa sse2/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { # # Multi-block SAD, comparing a reference to N independent blocks # foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; - if ($w != 128 && $h != 128) { - specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; + add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + } + + specialize qw/aom_sad128x128x4d avx2 sse2/; + specialize qw/aom_sad128x64x4d avx2 sse2/; + specialize qw/aom_sad64x128x4d avx2 sse2/; + specialize qw/aom_sad64x64x4d avx2 neon msa sse2/; + specialize qw/aom_sad64x32x4d avx2 msa sse2/; + specialize qw/aom_sad32x64x4d avx2 msa sse2/; + specialize qw/aom_sad32x32x4d avx2 neon msa sse2/; + specialize qw/aom_sad32x16x4d msa sse2/; + specialize qw/aom_sad16x32x4d msa sse2/; + specialize qw/aom_sad16x16x4d neon msa sse2/; + specialize qw/aom_sad16x8x4d msa sse2/; + specialize qw/aom_sad8x16x4d msa sse2/; + specialize qw/aom_sad8x8x4d msa sse2/; + specialize qw/aom_sad8x4x4d msa sse2/; + specialize qw/aom_sad4x8x4d msa sse2/; + specialize qw/aom_sad4x4x4d msa sse2/; + + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + # + # Multi-block SAD, comparing a reference to N independent blocks + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + if ($w != 128 && $h != 128) { + specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; + } } + specialize qw/aom_highbd_sad128x128x4d avx2/; + specialize qw/aom_highbd_sad128x64x4d avx2/; + specialize qw/aom_highbd_sad64x128x4d avx2/; + specialize qw/aom_highbd_sad64x64x4d avx2/; + specialize qw/aom_highbd_sad64x32x4d avx2/; + specialize qw/aom_highbd_sad32x64x4d avx2/; + specialize qw/aom_highbd_sad32x32x4d avx2/; + specialize qw/aom_highbd_sad32x16x4d avx2/; + specialize qw/aom_highbd_sad16x32x4d avx2/; + specialize qw/aom_highbd_sad16x16x4d avx2/; + specialize qw/aom_highbd_sad16x8x4d avx2/; } - specialize qw/aom_highbd_sad128x128x4d avx2/; - specialize qw/aom_highbd_sad128x64x4d avx2/; - specialize qw/aom_highbd_sad64x128x4d avx2/; - specialize qw/aom_highbd_sad64x64x4d avx2/; - specialize qw/aom_highbd_sad64x32x4d avx2/; - specialize qw/aom_highbd_sad32x64x4d avx2/; - specialize qw/aom_highbd_sad32x32x4d avx2/; - specialize qw/aom_highbd_sad32x16x4d avx2/; - specialize qw/aom_highbd_sad16x32x4d avx2/; - specialize qw/aom_highbd_sad16x16x4d avx2/; - specialize qw/aom_highbd_sad16x8x4d avx2/; -} -# -# Structured Similarity (SSIM) -# -if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { - add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; + # + # Structured Similarity (SSIM) + # + if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") { + add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64"; - add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; + add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + } } -} } # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { -# -# Specialty Variance -# -add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + # + # Specialty Variance + # + add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/aom_get16x16var sse2 avx2 neon msa/; -specialize qw/aom_get8x8var sse2 neon msa/; + specialize qw/aom_get16x16var sse2 avx2 neon msa/; + specialize qw/aom_get8x8var sse2 neon msa/; -add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; -specialize qw/aom_mse16x16 sse2 avx2 neon msa/; -specialize qw/aom_mse16x8 sse2 msa/; -specialize qw/aom_mse8x16 sse2 msa/; -specialize qw/aom_mse8x8 sse2 msa/; + specialize qw/aom_mse16x16 sse2 avx2 neon msa/; + specialize qw/aom_mse16x8 sse2 msa/; + specialize qw/aom_mse8x16 sse2 msa/; + specialize qw/aom_mse8x8 sse2 msa/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd (8, 10, 12) { - add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; - specialize "aom_highbd_${bd}_mse8x8", qw/sse2/; + specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2/; + } } -} -# -# ... -# -add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; -specialize qw/aom_upsampled_pred sse2/; -add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; -specialize qw/aom_comp_avg_upsampled_pred sse2/; + # + # ... + # + add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; + specialize qw/aom_upsampled_pred sse2/; + add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; + specialize qw/aom_comp_avg_upsampled_pred sse2/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; - specialize qw/aom_highbd_upsampled_pred sse2/; - add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; - specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; -} + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + specialize qw/aom_highbd_upsampled_pred sse2/; + add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; + } -# -# ... -# -add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; -add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; + # + # ... + # + add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; + add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; -specialize qw/aom_get_mb_ss sse2 msa/; -specialize qw/aom_get4x4sse_cs neon msa/; + specialize qw/aom_get_mb_ss sse2 msa/; + specialize qw/aom_get4x4sse_cs neon msa/; -# -# Variance / Subpixel Variance / Subpixel Avg Variance -# + # + # Variance / Subpixel Variance / Subpixel Avg Variance + # add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; -} + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + } -specialize qw/aom_variance64x64 sse2 avx2 neon msa/; -specialize qw/aom_variance64x32 sse2 avx2 neon msa/; -specialize qw/aom_variance32x64 sse2 neon msa/; -specialize qw/aom_variance32x32 sse2 avx2 neon msa/; -specialize qw/aom_variance32x16 sse2 avx2 msa/; -specialize qw/aom_variance16x32 sse2 msa/; -specialize qw/aom_variance16x16 sse2 avx2 neon msa/; -specialize qw/aom_variance16x8 sse2 neon msa/; -specialize qw/aom_variance8x16 sse2 neon msa/; -specialize qw/aom_variance8x8 sse2 neon msa/; -specialize qw/aom_variance8x4 sse2 msa/; -specialize qw/aom_variance4x8 sse2 msa/; -specialize qw/aom_variance4x4 sse2 msa/; - -specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; - -specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; -specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - -if (aom_config("CONFIG_EXT_PARTITION_TYPES")) { - specialize qw/aom_variance4x16 sse2/; - specialize qw/aom_variance16x4 sse2/; - specialize qw/aom_variance8x32 sse2/; - specialize qw/aom_variance32x8 sse2/; - specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; -} + specialize qw/aom_variance64x64 sse2 avx2 neon msa/; + specialize qw/aom_variance64x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x64 sse2 neon msa/; + specialize qw/aom_variance32x32 sse2 avx2 neon msa/; + specialize qw/aom_variance32x16 sse2 avx2 msa/; + specialize qw/aom_variance16x32 sse2 msa/; + specialize qw/aom_variance16x16 sse2 avx2 neon msa/; + specialize qw/aom_variance16x8 sse2 neon msa/; + specialize qw/aom_variance8x16 sse2 neon msa/; + specialize qw/aom_variance8x8 sse2 neon msa/; + specialize qw/aom_variance8x4 sse2 msa/; + specialize qw/aom_variance4x8 sse2 msa/; + specialize qw/aom_variance4x4 sse2 msa/; + + specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd (8, 10, 12) { - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + + if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") { + specialize qw/aom_variance4x16 sse2/; + specialize qw/aom_variance16x4 sse2/; + specialize qw/aom_variance8x32 sse2/; + specialize qw/aom_variance32x8 sse2/; + specialize qw/aom_variance16x64 sse2/; + specialize qw/aom_variance64x16 sse2/; + specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; + } - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $bd (8, 10, 12) { + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { - specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; - } - # TODO(david.barker): When ext-partition-types is enabled, we currenly - # don't have vectorized 4x16 highbd variance functions - if ($w == 4 && $h == 4) { - specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; - } - if ($w != 128 && $h != 128 && $w != 4) { - specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; - } - if ($w == 4 && $h == 4) { - specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; + } + # TODO(david.barker): When ext-partition-types is enabled, we currently + # don't have vectorized 4x16 highbd variance functions + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; + } + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; + } + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + } } } - } -} # CONFIG_HIGHBITDEPTH + } # CONFIG_HIGHBITDEPTH -if (aom_config("CONFIG_EXT_INTER") eq "yes") { -# -# Masked Variance / Masked Subpixel Variance -# + # + # Masked Variance / Masked Subpixel Variance + # foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse"; @@ -1056,453 +1198,450 @@ if (aom_config("CONFIG_EXT_INTER") eq "yes") { } } } -} -# -# OBMC Variance / OBMC Subpixel Variance -# -if (aom_config("CONFIG_MOTION_VAR") eq "yes") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; - } + # + # OBMC Variance / OBMC Subpixel Variance + # + if (aom_config("CONFIG_MOTION_VAR") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; + } - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd ("_", "_10_", "_12_") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_", "_10_", "_12_") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; + } } } } -} -add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; -add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; -# -# Specialty Subpixel -# -add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + # + # Specialty Subpixel + # + add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_variance_halfpixvar16x16_h sse2/; -add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_variance_halfpixvar16x16_v sse2/; -add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_variance_halfpixvar16x16_hv sse2/; -# -# Comp Avg -# -add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x64 sse2/; + # + # Comp Avg + # + add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; + if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x64 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance64x32 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x32 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x64 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x64 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x32 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x32 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance32x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance32x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x32 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x32 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance16x8 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance16x8 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_variance8x8 sse2/; + add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance8x8 sse2/; - add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x64 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x64 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x32 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x32 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x64 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x64 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x32 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x32 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance32x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x32 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x32 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x8 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance16x8 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x8 sse2/; + add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance8x8 sse2/; - add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x64 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x64 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance64x32 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x32 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x64 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x64 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x32 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x32 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance32x16 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance32x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x32 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x32 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x16 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance16x8 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance16x8 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x16 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_variance8x8 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance8x8 sse2/; - add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_mse16x16 sse2/; + add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse16x16 sse2/; - add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_8_mse8x8 sse2/; + add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_mse8x8 sse2/; - add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_mse16x16 sse2/; + add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse16x16 sse2/; - add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_mse8x8 sse2/; + add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_mse8x8 sse2/; - add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_mse16x16 sse2/; + add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse16x16 sse2/; - add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/aom_highbd_12_mse8x8 sse2/; + add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_mse8x8 sse2/; - add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; - # - # Subpixel Variance - # - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; + # + # Subpixel Variance + # + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; -} # CONFIG_HIGHBITDEPTH + } # CONFIG_HIGHBITDEPTH -if (aom_config("CONFIG_EXT_INTER") eq "yes") { add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd"; } -} } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c index 2dc5b2e56..7d5f64004 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -529,229 +529,4 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, } } } - -void aom_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } -} - -void aom_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - } -} - -void aom_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } - } -} - -void aom_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - } - } -} #endif // !HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm index 7d04d3553..fba9c1b5b 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm +++ b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm @@ -19,10 +19,6 @@ EXPORT |aom_h_predictor_8x8_neon| EXPORT |aom_h_predictor_16x16_neon| EXPORT |aom_h_predictor_32x32_neon| - EXPORT |aom_tm_predictor_4x4_neon| - EXPORT |aom_tm_predictor_8x8_neon| - EXPORT |aom_tm_predictor_16x16_neon| - EXPORT |aom_tm_predictor_32x32_neon| ARM REQUIRE8 PRESERVE8 @@ -289,345 +285,3 @@ loop_h bgt loop_h bx lr ENDP ; |aom_h_predictor_32x32_neon| - -;void aom_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_4x4_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.u8 {d0[]}, [r12] - - ; Load above 4 pixels - vld1.32 {d2[0]}, [r2] - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3]! - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - - ; 3rd row and 4th row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3] - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - bx lr - ENDP ; |aom_tm_predictor_4x4_neon| - -;void aom_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_8x8_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; preload 8 left - vld1.8 {d30}, [r3] - - ; Load above 8 pixels - vld1.64 {d2}, [r2] - - vmovl.u8 q10, d30 - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vdup.16 q0, d20[0] - vdup.16 q1, d20[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 3rd row and 4th row - vdup.16 q8, d20[2] - vdup.16 q9, d20[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - ; 5th row and 6th row - vdup.16 q0, d21[0] - vdup.16 q1, d21[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 7th row and 8th row - vdup.16 q8, d21[2] - vdup.16 q9, d21[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - bx lr - ENDP ; |aom_tm_predictor_8x8_neon| - -;void aom_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_16x16_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 8 pixels - vld1.8 {q1}, [r2] - - ; preload 8 left into r12 - vld1.8 {d18}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d0 - - vmovl.u8 q10, d18 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 2 times to process 16 rows. - mov r2, #2 - -loop_16x16_neon - ; Process two rows. - vdup.16 q0, d20[0] - vdup.16 q8, d20[1] - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d20[2] ; proload next 2 rows data - vdup.16 q8, d20[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - ; Process two rows. - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[0] ; proload next 2 rows data - vdup.16 q8, d21[1] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[2] ; proload next 2 rows data - vdup.16 q8, d21[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vld1.8 {d18}, [r3]! ; preload 8 left into r12 - vmovl.u8 q10, d18 - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - subs r2, r2, #1 - bgt loop_16x16_neon - - bx lr - ENDP ; |aom_tm_predictor_16x16_neon| - -;void aom_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_tm_predictor_32x32_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 32 pixels - vld1.8 {q1}, [r2]! - vld1.8 {q2}, [r2] - - ; preload 8 left pixels - vld1.8 {d26}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q8, d2, d0 - vsubl.u8 q9, d3, d0 - vsubl.u8 q10, d4, d0 - vsubl.u8 q11, d5, d0 - - vmovl.u8 q3, d26 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 4 times to process 32 rows. - mov r2, #4 - -loop_32x32_neon - ; Process two rows. - vdup.16 q0, d6[0] - vdup.16 q2, d6[1] - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q1, d6[2] - vdup.16 q2, d6[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q1, q8 - vadd.s16 q13, q1, q9 - vadd.s16 q14, q1, q10 - vadd.s16 q15, q1, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[0] - vdup.16 q2, d7[1] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[2] - vdup.16 q2, d7[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vld1.8 {d0}, [r3]! ; preload 8 left pixels - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vmovl.u8 q3, d0 - vst1.64 {d24-d27}, [r0], r1 - - subs r2, r2, #1 - bgt loop_32x32_neon - - bx lr - ENDP ; |aom_tm_predictor_32x32_neon| - - END diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c index bf304dada..4f38afbc5 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.c +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -53,6 +53,15 @@ uint16_t aom_read_primitive_quniform_(aom_reader *r, return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); } +static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, + uint16_t n) { + if (n <= 1) return 0; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + const int v = aom_rb_read_literal(rb, l - 1); + return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); +} + uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p, uint16_t ref ACCT_STR_PARAM) { if (n <= 1) return 0; @@ -101,15 +110,42 @@ uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, return v; } -// Decode finite subexponential code that for a symbol v in [0, n-1] with -// parameter k -// based on a reference ref also in [0, n-1]. +static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb, + uint16_t n, uint16_t k) { + int i = 0; + int mk = 0; + uint16_t v; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + v = aom_rb_read_primitive_quniform(rb, n - mk) + mk; + break; + } else { + if (aom_rb_read_bit(rb)) { + i = i + 1; + mk += a; + } else { + v = aom_rb_read_literal(rb, b) + mk; + break; + } + } + } + return v; +} + uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, uint16_t ref ACCT_STR_PARAM) { return inv_recenter_finite_nonneg( n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME)); } +static uint16_t aom_rb_read_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) { + return inv_recenter_finite_nonneg(n, ref, + aom_rb_read_primitive_subexpfin(rb, n, k)); +} + // Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with // parameter k based on a reference ref also in [-(n-1), n-1]. int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, @@ -120,3 +156,10 @@ int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref, ACCT_STR_NAME) - n + 1; } + +int16_t aom_rb_read_signed_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) { + ref += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; +} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h index 1540cf46b..8885142c9 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.h +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -17,9 +17,11 @@ extern "C" { #endif #include + #include "./aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/bitreader.h" +#include "aom_dsp/bitreader_buffer.h" #define aom_read_primitive_symmetric(r, n, ACCT_STR_NAME) \ aom_read_primitive_symmetric_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) @@ -47,6 +49,9 @@ uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, int16_t ref ACCT_STR_PARAM); + +int16_t aom_rb_read_signed_primitive_refsubexpfin( + struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c index 91e807b29..e092b6278 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.c +++ b/third_party/aom/aom_dsp/binary_codes_writer.c @@ -10,6 +10,7 @@ */ #include "aom_dsp/bitwriter.h" +#include "aom_dsp/binary_codes_writer.h" #include "av1/common/common.h" @@ -68,6 +69,19 @@ void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { } } +static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t v) { + if (n <= 1) return; + const int l = get_msb(n - 1) + 1; + const int m = (1 << l) - n; + if (v < m) { + aom_wb_write_literal(wb, v, l - 1); + } else { + aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1); + aom_wb_write_bit(wb, (v - m) & 1); + } +} + int aom_count_primitive_quniform(uint16_t n, uint16_t v) { if (n <= 1) return 0; const int l = get_msb(n - 1) + 1; @@ -155,6 +169,31 @@ void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, } } +static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + uint16_t v) { + int i = 0; + int mk = 0; + while (1) { + int b = (i ? k + i - 1 : k); + int a = (1 << b); + if (n <= mk + 3 * a) { + aom_wb_write_primitive_quniform(wb, n - mk, v - mk); + break; + } else { + int t = (v >= mk + a); + aom_wb_write_bit(wb, t); + if (t) { + i = i + 1; + mk += a; + } else { + aom_wb_write_literal(wb, v - mk, b); + break; + } + } + } +} + int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { int count = 0; int i = 0; @@ -184,19 +223,34 @@ int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) { // based on a reference ref also in [0, n-1]. // Recenters symbol around r first and then uses a finite subexponential code. void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, - int16_t ref, int16_t v) { + uint16_t ref, uint16_t v) { aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v)); } +static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + uint16_t ref, uint16_t v) { + aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v)); +} + void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, - uint16_t k, uint16_t ref, - uint16_t v) { + uint16_t k, int16_t ref, + int16_t v) { ref += n - 1; v += n - 1; const uint16_t scaled_n = (n << 1) - 1; aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v); } +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v) { + ref += n - 1; + v += n - 1; + const uint16_t scaled_n = (n << 1) - 1; + aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v); +} + int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v) { return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v)); diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h index ab5ccbf15..18ad5078f 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.h +++ b/third_party/aom/aom_dsp/binary_codes_writer.h @@ -20,6 +20,7 @@ extern "C" { #include "./aom_config.h" #include "aom/aom_integer.h" #include "aom_dsp/bitwriter.h" +#include "aom_dsp/bitwriter_buffer.h" // Codes a symbol v in [-2^mag_bits, 2^mag_bits] // mag_bits is number of bits for magnitude. The alphabet is of size @@ -53,6 +54,10 @@ void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k, int16_t ref, int16_t v); +void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, + uint16_t n, uint16_t k, + int16_t ref, int16_t v); + // Functions that counts bits for the above primitives int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); int aom_count_primitive_quniform(uint16_t n, uint16_t v); diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h index 88bedccc2..00424fa76 100644 --- a/third_party/aom/aom_dsp/bitreader.h +++ b/third_party/aom/aom_dsp/bitreader.h @@ -50,6 +50,11 @@ #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) +#if CONFIG_LV_MAP +#define aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME) \ + aom_read_bin_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) +#endif + #ifdef __cplusplus extern "C" { #endif @@ -198,6 +203,16 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, return ret; } +#if CONFIG_LV_MAP +static INLINE int aom_read_bin_(aom_reader *r, aom_cdf_prob *cdf, + int nsymbs ACCT_STR_PARAM) { + int ret; + ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); + update_cdf(cdf, ret, nsymbs); + return ret; +} +#endif + static INLINE int aom_read_tree_as_cdf(aom_reader *r, const aom_tree_index *tree, const aom_prob *probs) { diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h index 68bc1c8f8..7d3b34306 100644 --- a/third_party/aom/aom_dsp/bitwriter.h +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -62,9 +62,8 @@ static INLINE void init_token_stats(TOKEN_STATS *token_stats) { static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { #if CONFIG_ANS - (void)bc; - (void)buffer; - assert(0 && "buf_ans requires a more complicated startup procedure"); + aom_buf_ans_alloc(bc, /* error context*/ NULL); + buf_ans_write_init(bc, buffer); #else aom_daala_start_encode(bc, buffer); #endif @@ -72,8 +71,8 @@ static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { static INLINE void aom_stop_encode(aom_writer *bc) { #if CONFIG_ANS - (void)bc; - assert(0 && "buf_ans requires a more complicated shutdown procedure"); + aom_buf_ans_flush(bc); + bc->pos = buf_ans_write_end(bc); #else aom_daala_stop_encode(bc); #endif @@ -143,6 +142,14 @@ static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, update_cdf(cdf, symb, nsymbs); } +#if CONFIG_LV_MAP +static INLINE void aom_write_bin(aom_writer *w, int symb, aom_cdf_prob *cdf, + int nsymbs) { + aom_write_cdf(w, symb, cdf, nsymbs); + update_cdf(cdf, symb, nsymbs); +} +#endif + static INLINE void aom_write_tree_as_cdf(aom_writer *w, const aom_tree_index *tree, const aom_prob *probs, int bits, diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c index 8fe1ff763..f7703dffc 100644 --- a/third_party/aom/aom_dsp/buf_ans.c +++ b/third_party/aom/aom_dsp/buf_ans.c @@ -16,9 +16,8 @@ #include "aom/internal/aom_codec_internal.h" void aom_buf_ans_alloc(struct BufAnsCoder *c, - struct aom_internal_error_info *error, int size) { + struct aom_internal_error_info *error) { c->error = error; - c->size = size; assert(c->size > 1); AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf))); // Initialize to overfull to trigger the assert in write. diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h index 0768506b3..f84ff3aed 100644 --- a/third_party/aom/aom_dsp/buf_ans.h +++ b/third_party/aom/aom_dsp/buf_ans.h @@ -46,6 +46,7 @@ struct BufAnsCoder { #if ANS_MAX_SYMBOLS int window_size; #endif + int pos; // Dummy variable to store the output buffer after closing }; // Allocate a buffered ANS coder to store size symbols. @@ -54,7 +55,7 @@ struct BufAnsCoder { // When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the // buffer will grow on demand void aom_buf_ans_alloc(struct BufAnsCoder *c, - struct aom_internal_error_info *error, int hint); + struct aom_internal_error_info *error); void aom_buf_ans_free(struct BufAnsCoder *c); diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c index 0fc7b14a5..c6e3ac82d 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.c +++ b/third_party/aom/aom_dsp/daalaboolreader.c @@ -17,7 +17,7 @@ int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) { } r->buffer_end = buffer + size; r->buffer = buffer; - od_ec_dec_init(&r->ec, buffer, size - 1); + od_ec_dec_init(&r->ec, buffer, size); #if CONFIG_ACCOUNTING r->accounting = NULL; #endif diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h index 428d74db0..55ff8d3d5 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.h +++ b/third_party/aom/aom_dsp/daalaboolreader.h @@ -45,11 +45,7 @@ uint32_t aom_daala_reader_tell_frac(const daala_reader *r); static INLINE int aom_daala_read(daala_reader *r, int prob) { int bit; -#if CONFIG_EC_SMALLMUL int p = (0x7FFFFF - (prob << 15) + prob) >> 8; -#else - int p = ((prob << 15) + 256 - prob) >> 8; -#endif #if CONFIG_BITSTREAM_DEBUG /*{ const int queue_r = bitstream_queue_get_read(); @@ -113,6 +109,7 @@ static INLINE int aom_daala_reader_has_error(daala_reader *r) { static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf, int nsymbs) { int symb; + assert(cdf != NULL); symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs); #if CONFIG_BITSTREAM_DEBUG diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c index 0ba8f6ab8..59af2a243 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.c +++ b/third_party/aom/aom_dsp/daalaboolwriter.c @@ -24,9 +24,5 @@ void aom_daala_stop_encode(daala_writer *br) { daala_data = od_ec_enc_done(&br->ec, &daala_bytes); memcpy(br->buffer, daala_data, daala_bytes); br->pos = daala_bytes; - /* Prevent ec bitstream from being detected as a superframe marker. - Must always be added, so that rawbits knows the exact length of the - bitstream. */ - br->buffer[br->pos++] = 0; od_ec_enc_clear(&br->ec); } diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h index bbaf53c69..6ec0f0b54 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.h +++ b/third_party/aom/aom_dsp/daalaboolwriter.h @@ -36,11 +36,7 @@ void aom_daala_start_encode(daala_writer *w, uint8_t *buffer); void aom_daala_stop_encode(daala_writer *w); static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) { -#if CONFIG_EC_SMALLMUL int p = (0x7FFFFF - (prob << 15) + prob) >> 8; -#else - int p = ((prob << 15) + 256 - prob) >> 8; -#endif #if CONFIG_BITSTREAM_DEBUG aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 }; /*int queue_r = 0; diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h index 534959e66..981a951e6 100644 --- a/third_party/aom/aom_dsp/entcode.h +++ b/third_party/aom/aom_dsp/entcode.h @@ -28,15 +28,11 @@ typedef uint32_t od_ec_window; 3 => 1/8th bits.*/ #define OD_BITRES (3) -/*With CONFIG_EC_SMALLMUL, the value stored in a CDF is 32768 minus the actual - Q15 cumulative probability (an "inverse" CDF). +/*The value stored in an iCDF is 32768 minus the actual Q15 cumulative + probability (an "inverse" CDF). This function converts from one representation to the other (and is its own inverse).*/ -#if CONFIG_EC_SMALLMUL #define OD_ICDF(x) (32768U - (x)) -#else -#define OD_ICDF(x) (x) -#endif /*See entcode.c for further documentation.*/ diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c index 49b176cd8..71dad0df6 100644 --- a/third_party/aom/aom_dsp/entdec.c +++ b/third_party/aom/aom_dsp/entdec.c @@ -114,12 +114,8 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, OD_ASSERT(rng <= 65535U); d = 16 - OD_ILOG_NZ(rng); dec->cnt -= d; -#if CONFIG_EC_SMALLMUL /*This is equivalent to shifting in 1's instead of 0's.*/ dec->dif = ((dif + 1) << d) - 1; -#else - dec->dif = dif << d; -#endif dec->rng = rng << d; if (dec->cnt < 0) od_ec_dec_refill(dec); return ret; @@ -137,11 +133,7 @@ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); dec->end = buf + storage; dec->bptr = buf; -#if CONFIG_EC_SMALLMUL dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1; -#else - dec->dif = 0; -#endif dec->rng = 0x8000; dec->cnt = -15; dec->error = 0; @@ -149,8 +141,7 @@ void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, } /*Decode a single binary value. - {EC_SMALLMUL} f: The probability that the bit is one, scaled by 32768. - {else} f: The probability that the bit is zero, scaled by 32768. + f: The probability that the bit is one, scaled by 32768. Return: The value decoded (0 or 1).*/ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { od_ec_window dif; @@ -165,7 +156,6 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { r = dec->rng; OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL v = (r >> 8) * (uint32_t)f >> 7; vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); ret = 1; @@ -175,30 +165,19 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { dif -= vw; ret = 0; } -#else - v = f * (uint32_t)r >> 15; - vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); - ret = 0; - r_new = v; - if (dif >= vw) { - r_new = r - v; - dif -= vw; - ret = 1; - } -#endif return od_ec_dec_normalize(dec, dif, r_new, ret); } -/*Decodes a symbol given a cumulative distribution function (CDF) table in Q15. - cdf: The CDF, such that symbol s falls in the range - [s > 0 ? cdf[s - 1] : 0, cdf[s]). - The values must be monotonically non-increasing, and cdf[nsyms - 1] - must be 32768. - {EC_SMALLMUL}: The CDF contains 32768 minus those values. +/*Decodes a symbol given an inverse cumulative distribution function (CDF) + table in Q15. + icdf: 32768 minus the CDF, such that symbol s falls in the range + [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + The values must be monotonically non-increasing, and icdf[nsyms - 1] + must be 0. nsyms: The number of symbols in the alphabet. This should be at most 16. Return: The decoded symbol s.*/ -int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) { +int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { od_ec_window dif; unsigned r; unsigned c; @@ -209,33 +188,19 @@ int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *cdf, int nsyms) { dif = dec->dif; r = dec->rng; OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); - OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U)); + OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U)); OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); v = r; ret = -1; do { u = v; - v = (r >> 8) * (uint32_t)cdf[++ret] >> 7; + v = (r >> 8) * (uint32_t)icdf[++ret] >> 7; } while (c < v); OD_ASSERT(v < u); OD_ASSERT(u <= r); r = u - v; dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); -#else - c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); - v = 0; - ret = -1; - do { - u = v; - v = cdf[++ret] * (uint32_t)r >> 15; - } while (v <= c); - OD_ASSERT(u < v); - OD_ASSERT(v <= r); - r = v - u; - dif -= (od_ec_window)u << (OD_EC_WINDOW_SIZE - 16); -#endif return od_ec_dec_normalize(dec, dif, r, ret); } diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h index e1145e81d..35ac7fe0d 100644 --- a/third_party/aom/aom_dsp/entdec.h +++ b/third_party/aom/aom_dsp/entdec.h @@ -47,10 +47,8 @@ struct od_ec_dec { const unsigned char *end; /*The read pointer for the entropy-coded bits.*/ const unsigned char *bptr; - /*The difference between the coded value and the low end of the current - range. - {EC_SMALLMUL} The difference between the high end of the current range, - (low + rng), and the coded value, minus 1. + /*The difference between the high end of the current range, (low + rng), and + the coded value, minus 1. This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the decoder only uses the top 16 bits of the window to decode the next symbol. As we shift up during renormalization, if we don't have enough bits left in diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c index a350f27f4..b8c4dc047 100644 --- a/third_party/aom/aom_dsp/entenc.c +++ b/third_party/aom/aom_dsp/entenc.c @@ -143,11 +143,10 @@ void od_ec_enc_clear(od_ec_enc *enc) { } /*Encodes a symbol given its frequency in Q15. - fl: The cumulative frequency of all symbols that come before the one to be - encoded. - fh: The cumulative frequency of all symbols up to and including the one to - be encoded. - {EC_SMALLMUL} Both values are 32768 minus that.*/ + fl: 32768 minus the cumulative frequency of all symbols that come before the + one to be encoded. + fh: 32768 minus the cumulative frequency of all symbols up to and including + the one to be encoded.*/ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { od_ec_window l; unsigned r; @@ -156,7 +155,6 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { l = enc->low; r = enc->rng; OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL OD_ASSERT(fh < fl); OD_ASSERT(fl <= 32768U); if (fl < 32768U) { @@ -167,14 +165,6 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { } else { r -= (r >> 8) * (uint32_t)fh >> 7; } -#else - OD_ASSERT(fl < fh); - OD_ASSERT(fh <= 32768U); - u = fl * (uint32_t)r >> 15; - v = fh * (uint32_t)r >> 15; - r = v - u; - l += u; -#endif od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.); @@ -184,8 +174,7 @@ static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { /*Encode a single binary value. val: The value to encode (0 or 1). - {EC_SMALLMUL} f: The probability that the val is one, scaled by 32768. - {else} f: The probability that val is zero, scaled by 32768.*/ + f: The probability that the val is one, scaled by 32768.*/ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { od_ec_window l; unsigned r; @@ -195,15 +184,9 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { l = enc->low; r = enc->rng; OD_ASSERT(32768U <= r); -#if CONFIG_EC_SMALLMUL v = (r >> 8) * (uint32_t)f >> 7; if (val) l += r - v; r = val ? v : r - v; -#else - v = f * (uint32_t)r >> 15; - if (val) l += v; - r = val ? r - v : v; -#endif od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD enc->entropy -= @@ -214,19 +197,19 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { /*Encodes a symbol given a cumulative distribution function (CDF) table in Q15. s: The index of the symbol to encode. - cdf: The CDF, such that symbol s falls in the range - [s > 0 ? cdf[s - 1] : 0, cdf[s]). - The values must be monotonically non-decreasing, and the last value - must be exactly 32768. + icdf: 32768 minus the CDF, such that symbol s falls in the range + [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + The values must be monotonically decreasing, and icdf[nsyms - 1] must + be 0. nsyms: The number of symbols in the alphabet. This should be at most 16.*/ -void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, +void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, int nsyms) { (void)nsyms; OD_ASSERT(s >= 0); OD_ASSERT(s < nsyms); - OD_ASSERT(cdf[nsyms - 1] == OD_ICDF(32768U)); - od_ec_encode_q15(enc, s > 0 ? cdf[s - 1] : OD_ICDF(0), cdf[s]); + OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U)); + od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s]); } #if CONFIG_RAWBITS diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c index b4d47ae89..6d2ac37d9 100644 --- a/third_party/aom/aom_dsp/intrapred.c +++ b/third_party/aom/aom_dsp/intrapred.c @@ -16,6 +16,7 @@ #include "./aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/intrapred_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" @@ -179,7 +180,6 @@ static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } -#if CONFIG_ALT_INTRA static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; } static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top, @@ -208,40 +208,6 @@ static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } } -// Weights are quadratic from '1' to '1 / block_size', scaled by -// 2^sm_weight_log2_scale. -static const int sm_weight_log2_scale = 8; - -#if CONFIG_TX64X64 -// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) -#define MAX_BLOCK_DIM 64 -#else -#define MAX_BLOCK_DIM 32 -#endif // CONFIG_TX64X64 - -static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { - // Unused, because we always offset by bs, which is at least 2. - 0, 0, - // bs = 2 - 255, 128, - // bs = 4 - 255, 149, 85, 64, - // bs = 8 - 255, 197, 146, 105, 73, 50, 37, 32, - // bs = 16 - 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, - // bs = 32 - 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, - 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, -#if CONFIG_TX64X64 - // bs = 64 - 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, - 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, - 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, - 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, -#endif // CONFIG_TX64X64 -}; - // Some basic checks on weights for smooth predictor. #define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \ pred_scale) \ @@ -344,21 +310,6 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, } #endif // CONFIG_SMOOTH_HV -#else - -static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left) { - int r, c; - int ytop_left = above[-1]; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - dst[c] = clip_pixel(left[r] + above[c] - ytop_left); - dst += stride; - } -} -#endif // CONFIG_ALT_INTRA - static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { @@ -794,7 +745,6 @@ void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, DST(1, 1) = AVG3(J, I, X); } -#if CONFIG_ALT_INTRA static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -901,23 +851,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, dst += stride; } } -#endif - -#else -static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bw, - int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - int ytop_left = above[-1]; - (void)bd; - - for (r = 0; r < bh; r++) { - for (c = 0; c < bw; c++) - dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); - dst += stride; - } -} -#endif // CONFIG_ALT_INTRA +#endif // CONFIG_SMOOTH_HV static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, @@ -1017,12 +951,16 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, intra_pred_sized(type, 16, 8) \ intra_pred_sized(type, 16, 32) \ intra_pred_sized(type, 32, 16) \ + intra_pred_sized(type, 32, 64) \ + intra_pred_sized(type, 64, 32) \ intra_pred_highbd_sized(type, 4, 8) \ intra_pred_highbd_sized(type, 8, 4) \ intra_pred_highbd_sized(type, 8, 16) \ intra_pred_highbd_sized(type, 16, 8) \ intra_pred_highbd_sized(type, 16, 32) \ - intra_pred_highbd_sized(type, 32, 16) + intra_pred_highbd_sized(type, 32, 16) \ + intra_pred_highbd_sized(type, 32, 64) \ + intra_pred_highbd_sized(type, 64, 32) #define intra_pred_above_4x4(type) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ @@ -1078,7 +1016,9 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, intra_pred_sized(type, 8, 16) \ intra_pred_sized(type, 16, 8) \ intra_pred_sized(type, 16, 32) \ - intra_pred_sized(type, 32, 16) + intra_pred_sized(type, 32, 16) \ + intra_pred_sized(type, 32, 64) \ + intra_pred_sized(type, 64, 32) #define intra_pred_above_4x4(type) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ @@ -1118,16 +1058,12 @@ intra_pred_above_4x4(d135) intra_pred_above_4x4(d153) intra_pred_allsizes(v) intra_pred_allsizes(h) -#if CONFIG_ALT_INTRA intra_pred_allsizes(smooth) #if CONFIG_SMOOTH_HV intra_pred_allsizes(smooth_v) intra_pred_allsizes(smooth_h) #endif // CONFIG_SMOOTH_HV intra_pred_allsizes(paeth) -#else -intra_pred_allsizes(tm) -#endif // CONFIG_ALT_INTRA intra_pred_allsizes(dc_128) intra_pred_allsizes(dc_left) intra_pred_allsizes(dc_top) diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h new file mode 100644 index 000000000..96da49b03 --- /dev/null +++ b/third_party/aom/aom_dsp/intrapred_common.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_INTRAPRED_COMMON_H +#define _AOM_DSP_INTRAPRED_COMMON_H + +#include "./aom_config.h" + +// Weights are quadratic from '1' to '1 / block_size', scaled by +// 2^sm_weight_log2_scale. +static const int sm_weight_log2_scale = 8; + +#if CONFIG_TX64X64 +// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) +#define MAX_BLOCK_DIM 64 +#else +#define MAX_BLOCK_DIM 32 +#endif // CONFIG_TX64X64 + +static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { + // Unused, because we always offset by bs, which is at least 2. + 0, 0, + // bs = 2 + 255, 128, + // bs = 4 + 255, 149, 85, 64, + // bs = 8 + 255, 197, 146, 105, 73, 50, 37, 32, + // bs = 16 + 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16, + // bs = 32 + 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, + 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, +#if CONFIG_TX64X64 + // bs = 64 + 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, + 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, + 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, + 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, +#endif // CONFIG_TX64X64 +}; + +#endif // _AOM_DSP_INTRAPRED_COMMON_H diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c index 398eb0a12..6b7c1c2ab 100644 --- a/third_party/aom/aom_dsp/inv_txfm.c +++ b/third_party/aom/aom_dsp/inv_txfm.c @@ -14,7 +14,8 @@ #include "./aom_dsp_rtcd.h" #include "aom_dsp/inv_txfm.h" -#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 +#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ + CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 #include "av1/common/daala_tx.h" #endif @@ -96,18 +97,6 @@ void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { } } -#if CONFIG_DAALA_DCT4 -void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[4]; - od_coeff y[4]; - for (i = 0; i < 4; i++) y[i] = input[i]; - od_bin_idct4(x, 1, y); - for (i = 0; i < 4; i++) output[i] = (tran_low_t)x[i]; -} - -#else - void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step[4]; tran_high_t temp1, temp2; @@ -127,7 +116,6 @@ void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { output[2] = WRAPLOW(step[1] - step[2]); output[3] = WRAPLOW(step[0] - step[3]); } -#endif void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[4 * 4]; @@ -172,18 +160,6 @@ void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, } } -#if CONFIG_DAALA_DCT8 -void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; - od_bin_idct8(x, 1, y); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; -} - -#else - void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; @@ -237,7 +213,6 @@ void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { output[6] = WRAPLOW(step1[1] - step1[6]); output[7] = WRAPLOW(step1[0] - step1[7]); } -#endif void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[8 * 8]; @@ -313,18 +288,6 @@ void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) { output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); } -#if CONFIG_DAALA_DCT8 -void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { - int i; - od_coeff x[8]; - od_coeff y[8]; - for (i = 0; i < 8; i++) y[i] = (od_coeff)input[i]; - od_bin_idst8(x, 1, y); - for (i = 0; i < 8; i++) output[i] = (tran_low_t)x[i]; -} - -#else - void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; @@ -402,8 +365,6 @@ void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { output[7] = WRAPLOW(-x1); } -#endif - void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; @@ -1224,7 +1185,7 @@ void aom_idct32_c(const tran_low_t *input, tran_low_t *output) { #if CONFIG_MRC_TX void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride, int *mask) { + int stride, uint8_t *mask) { tran_low_t out[32 * 32]; tran_low_t *outptr = out; int i, j; @@ -1265,7 +1226,7 @@ void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, } void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask) { + uint8_t *mask) { tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; @@ -1295,7 +1256,7 @@ void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, } void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask) { + uint8_t *mask) { tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; int i, j; diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h index a9c485e74..644a6599f 100644 --- a/third_party/aom/aom_dsp/inv_txfm.h +++ b/third_party/aom/aom_dsp/inv_txfm.h @@ -55,19 +55,22 @@ static INLINE tran_high_t check_range(tran_high_t input, int bd) { #if CONFIG_MRC_TX // These each perform dct but add coefficients based on a mask void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride, int *mask); + int stride, uint8_t *mask); void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask); + uint8_t *mask); void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, - int *mask); + uint8_t *mask); #endif // CONFIG_MRC_TX void aom_idct4_c(const tran_low_t *input, tran_low_t *output); void aom_idct8_c(const tran_low_t *input, tran_low_t *output); void aom_idct16_c(const tran_low_t *input, tran_low_t *output); void aom_idct32_c(const tran_low_t *input, tran_low_t *output); +#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64 +void aom_idct64_c(const tran_low_t *input, tran_low_t *output); +#endif void aom_iadst4_c(const tran_low_t *input, tran_low_t *output); void aom_iadst8_c(const tran_low_t *input, tran_low_t *output); void aom_iadst16_c(const tran_low_t *input, tran_low_t *output); diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c index 7ea1e6b89..69f131378 100644 --- a/third_party/aom/aom_dsp/loopfilter.c +++ b/third_party/aom/aom_dsp/loopfilter.c @@ -23,6 +23,14 @@ static INLINE int8_t signed_char_clamp(int t) { #define PARALLEL_DEBLOCKING_11_TAP 0 #define PARALLEL_DEBLOCKING_9_TAP 0 +#if CONFIG_DEBLOCK_13TAP +#define PARALLEL_DEBLOCKING_13_TAP 1 +#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1 +#else +#define PARALLEL_DEBLOCKING_13_TAP 0 +#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0 +#endif + #if CONFIG_HIGHBITDEPTH static INLINE int16_t signed_char_clamp_high(int t, int bd) { switch (bd) { @@ -58,6 +66,19 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, return ~mask; } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, + uint8_t p0, uint8_t q0, uint8_t q1, + uint8_t q2) { + int8_t mask = 0; + mask |= (abs(p1 - p0) > thresh) * -1; + mask |= (abs(q1 - q0) > thresh) * -1; + mask |= (abs(p2 - p0) > thresh) * -1; + mask |= (abs(q2 - q0) > thresh) * -1; + return ~mask; +} +#endif + static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { @@ -216,6 +237,25 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint8_t *op2, uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { + if (flat && mask) { + const uint8_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + filter4(mask, thresh, op1, op0, oq0, oq1); + } +} +#endif + static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, @@ -236,6 +276,32 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, } } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p); + ++s; + } +} +#endif + void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; @@ -268,6 +334,28 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + for (i = 0; i < count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); + filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); + s += pitch; + } +} +#endif + void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; @@ -297,6 +385,56 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); } +#if PARALLEL_DEBLOCKING_13_TAP +static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint8_t *op6, uint8_t *op5, + uint8_t *op4, uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, uint8_t *oq0, + uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) { + if (flat2 && flat && mask) { + const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, + p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, + q5 = *oq5, q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} +#endif + #if PARALLEL_DEBLOCKING_11_TAP static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint8_t *op5, uint8_t *op4, @@ -428,7 +566,16 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); -#if PARALLEL_DEBLOCKING_11_TAP +#if PARALLEL_DEBLOCKING_13_TAP + (void)p7; + (void)q7; + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); + +#elif PARALLEL_DEBLOCKING_11_TAP const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p, @@ -482,7 +629,14 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); -#if PARALLEL_DEBLOCKING_11_TAP +#if PARALLEL_DEBLOCKING_13_TAP + (void)p7; + (void)q7; + const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); + + filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, + s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); +#elif PARALLEL_DEBLOCKING_11_TAP const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2, @@ -553,6 +707,21 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, return ~mask; } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, + uint16_t p1, uint16_t p0, + uint16_t q0, uint16_t q1, + uint16_t q2, int bd) { + int8_t mask = 0; + int16_t thresh16 = (uint16_t)thresh << (bd - 8); + mask |= (abs(p1 - p0) > thresh16) * -1; + mask |= (abs(q1 - q0) > thresh16) * -1; + mask |= (abs(p2 - p0) > thresh16) * -1; + mask |= (abs(q2 - q0) > thresh16) * -1; + return ~mask; +} +#endif + static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3, @@ -708,6 +877,26 @@ void aom_highbd_lpf_vertical_4_dual_c( bd); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, + uint16_t *op2, uint16_t *op1, uint16_t *op0, + uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, + int bd) { + if (flat && mask) { + const uint16_t p2 = *op2, p1 = *op1, p0 = *op0; + const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2; + + // 5-tap filter [1, 2, 2, 2, 1] + *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3); + *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3); + *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3); + *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3); + } else { + highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); + } +} +#endif + static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, @@ -754,6 +943,33 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, } } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, + s + 1 * p, s + 2 * p, bd); + ++s; + } +} +#endif + void aom_highbd_lpf_horizontal_8_dual_c( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, @@ -762,6 +978,30 @@ void aom_highbd_lpf_horizontal_8_dual_c( aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); } +#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif + + for (i = 0; i < count; ++i) { + const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = + highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); + highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, + bd); + s += pitch; + } +} +#endif + void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { @@ -794,6 +1034,68 @@ void aom_highbd_lpf_vertical_8_dual_c( bd); } +#if PARALLEL_DEBLOCKING_13_TAP +static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, + int8_t flat2, uint16_t *op6, uint16_t *op5, + uint16_t *op4, uint16_t *op3, uint16_t *op2, + uint16_t *op1, uint16_t *op0, uint16_t *oq0, + uint16_t *oq1, uint16_t *oq2, uint16_t *oq3, + uint16_t *oq4, uint16_t *oq5, uint16_t *oq6, + int bd) { + if (flat2 && flat && mask) { + const uint16_t p6 = *op6; + const uint16_t p5 = *op5; + const uint16_t p4 = *op4; + const uint16_t p3 = *op3; + const uint16_t p2 = *op2; + const uint16_t p1 = *op1; + const uint16_t p0 = *op0; + const uint16_t q0 = *oq0; + const uint16_t q1 = *oq1; + const uint16_t q2 = *oq2; + const uint16_t q3 = *oq3; + const uint16_t q4 = *oq4; + const uint16_t q5 = *oq5; + const uint16_t q6 = *oq6; + + // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1] + *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0, + 4); + *op4 = ROUND_POWER_OF_TWO( + p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4); + *op3 = ROUND_POWER_OF_TWO( + p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4); + *op2 = ROUND_POWER_OF_TWO( + p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3, + 4); + *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 + + q0 + q1 + q2 + q3 + q4, + 4); + *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 + + q0 * 2 + q1 + q2 + q3 + q4 + q5, + 4); + *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 + + q1 * 2 + q2 + q3 + q4 + q5 + q6, + 4); + *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 + + q2 * 2 + q3 + q4 + q5 + q6 * 2, + 4); + *oq2 = ROUND_POWER_OF_TWO( + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3, + 4); + *oq3 = ROUND_POWER_OF_TWO( + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO( + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7, + 4); + } else { + highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + bd); + } +} +#endif + static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint16_t *op7, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, @@ -887,6 +1189,16 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); + +#if PARALLEL_DEBLOCKING_13_TAP + const int8_t flat2 = + highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], + s[5 * p], s[6 * p], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, + s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); +#else const int8_t flat2 = highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); @@ -895,6 +1207,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p, bd); +#endif ++s; } } @@ -937,12 +1250,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); +#if PARALLEL_DEBLOCKING_13_TAP + const int8_t flat2 = + highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd); + + highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, + s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, + s + 6, bd); +#else const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7], bd); highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, bd); +#endif s += p; } } diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c index 298065adb..3574da19f 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c @@ -407,6 +407,11 @@ void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint32_t tp1, tp2, tn1; uint32_t tp3, tp4, tn2; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + /* prefetch data to cache memory */ prefetch_load(src); prefetch_load(src + 32); diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c index c871702f4..dd4bc821a 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c @@ -1304,6 +1304,8 @@ void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; + (void)x_step_q4; + assert(x_step_q4 == 16); assert(y_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); @@ -1400,6 +1402,11 @@ void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, int w, int h) { int x, y; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + /* prefetch data to cache memory */ prefetch_load(src); prefetch_load(src + 32); diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c index dc8f20208..7c221ae89 100644 --- a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c +++ b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c @@ -17,6 +17,8 @@ void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + (void)above; + __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" "lb %[tmp2], 1(%[left]) \n\t" diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c index ea7c02810..0a21979c7 100644 --- a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c +++ b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c @@ -15,6 +15,7 @@ void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" @@ -78,148 +79,4 @@ void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } - -void aom_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t left0, left1, left2, left3; - int32_t res0, res1; - int32_t resl; - int32_t resr; - int32_t top_left; - uint8_t *cm = aom_ff_cropTbl; - - __asm__ __volatile__( - "ulw %[resl], (%[above]) \n\t" - - "lbu %[left0], (%[left]) \n\t" - "lbu %[left1], 1(%[left]) \n\t" - "lbu %[left2], 2(%[left]) \n\t" - "lbu %[left3], 3(%[left]) \n\t" - - "lbu %[top_left], -1(%[above]) \n\t" - - "preceu.ph.qbl %[abovel], %[resl] \n\t" - "preceu.ph.qbr %[abover], %[resl] \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "replv.ph %[left1], %[left1] \n\t" - "replv.ph %[left2], %[left2] \n\t" - "replv.ph %[left3], %[left3] \n\t" - - "replv.ph %[top_left], %[top_left] \n\t" - - "addu.ph %[resl], %[abovel], %[left0] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left0] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left1] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left1] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sb %[res1], 1(%[dst]) \n\t" - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left2] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left2] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sb %[res1], 1(%[dst]) \n\t" - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left3] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left3] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - : [abovel] "=&r"(abovel), [abover] "=&r"(abover), [left0] "=&r"(left0), - [left1] "=&r"(left1), [left2] "=&r"(left2), [res0] "=&r"(res0), - [res1] "=&r"(res1), [left3] "=&r"(left3), [resl] "=&r"(resl), - [resr] "=&r"(resr), [top_left] "=&r"(top_left) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride), [cm] "r"(cm)); -} #endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c index 1114fbc00..d42a77c80 100644 --- a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c +++ b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c @@ -15,6 +15,7 @@ void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" @@ -146,458 +147,4 @@ void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride)); } - -void aom_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t abovel_1, abover_1; - int32_t left0; - int32_t res0, res1, res2, res3; - int32_t reshw; - int32_t top_left; - uint8_t *cm = aom_ff_cropTbl; - - __asm__ __volatile__( - "ulw %[reshw], (%[above]) \n\t" - "ulw %[top_left], 4(%[above]) \n\t" - - "lbu %[left0], (%[left]) \n\t" - - "preceu.ph.qbl %[abovel], %[reshw] \n\t" - "preceu.ph.qbr %[abover], %[reshw] \n\t" - "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" - "preceu.ph.qbr %[abover_1], %[top_left] \n\t" - - "lbu %[top_left], -1(%[above]) \n\t" - "replv.ph %[left0], %[left0] \n\t" - - "replv.ph %[top_left], %[top_left] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 1(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 2(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 3(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 4(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 5(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 6(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 7(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - : [abovel] "=&r"(abovel), [abover] "=&r"(abover), - [abovel_1] "=&r"(abovel_1), [abover_1] "=&r"(abover_1), - [left0] "=&r"(left0), [res2] "=&r"(res2), [res3] "=&r"(res3), - [res0] "=&r"(res0), [res1] "=&r"(res1), [reshw] "=&r"(reshw), - [top_left] "=&r"(top_left) - : [above] "r"(above), [left] "r"(left), [dst] "r"(dst), - [stride] "r"(stride), [cm] "r"(cm)); -} #endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c index e8eaec7a9..bcb9c9df9 100644 --- a/third_party/aom/aom_dsp/mips/intrapred_msa.c +++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c @@ -382,176 +382,6 @@ static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { } } -static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint32_t val; - uint8_t top_left = src_top_ptr[-1]; - v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; - v16u8 src0, src1, src2, src3; - v8u16 src_top_left, vec0, vec1, vec2, vec3; - - src_top_left = (v8u16)__msa_fill_h(top_left); - val = LW(src_top_ptr); - src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); - - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - - ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, - src_left3, src_top, src0, src1, src2, src3); - HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); - SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); -} - -static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint64_t val; - uint8_t top_left = src_top_ptr[-1]; - uint32_t loop_cnt; - v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; - v8u16 src_top_left, vec0, vec1, vec2, vec3; - v16u8 src0, src1, src2, src3; - - val = LD(src_top_ptr); - src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 2; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, - src_left3, src_top, src0, src1, src2, src3); - HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); - SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint8_t top_left = src_top_ptr[-1]; - uint32_t loop_cnt; - v16i8 src_top, src_left0, src_left1, src_left2, src_left3; - v8u16 src_top_left, res_r, res_l; - - src_top = LD_SB(src_top_ptr); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 4; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVRL_B2_UH(src_left0, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left1, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left2, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left3, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - } -} - -static void intra_predict_tm_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, uint8_t *dst, - int32_t dst_stride) { - uint8_t top_left = src_top[-1]; - uint32_t loop_cnt; - v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; - v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; - - LD_SB2(src_top, 16, src_top0, src_top1); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 8; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - } -} - void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left) { (void)left; @@ -717,23 +547,3 @@ void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, intra_predict_128dc_32x32_msa(dst, y_stride); } - -void aom_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_4x4_msa(above, left, dst, y_stride); -} - -void aom_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_8x8_msa(above, left, dst, y_stride); -} - -void aom_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_16x16_msa(above, left, dst, y_stride); -} - -void aom_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_32x32_msa(above, left, dst, y_stride); -} diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h index 8a85e26f3..c69835173 100644 --- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h +++ b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h @@ -24,10 +24,12 @@ extern "C" { #endif #if HAVE_DSPR2 +/* Note: this macro expects a local int32_t named out to exist, and will write + * to that variable. */ #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ ({ \ \ - int32_t tmp, out; \ + int32_t tmp; \ int dct_cost_rounding = DCT_CONST_ROUNDING; \ int in = input; \ \ diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h index 35db134e5..a517e810a 100644 --- a/third_party/aom/aom_dsp/prob.h +++ b/third_party/aom/aom_dsp/prob.h @@ -46,6 +46,14 @@ typedef uint16_t aom_cdf_prob; #define MAX_PROB 255 +#define LV_MAP_PROB 1 + +#define BR_NODE 1 + +#if CONFIG_ADAPT_SCAN +#define CACHE_SCAN_PROB 1 +#endif + #define aom_prob_half ((aom_prob)128) typedef int8_t aom_tree_index; @@ -149,7 +157,11 @@ static INLINE void av1_tree_to_cdf(const aom_tree_index *tree, void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree); static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { - const int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); + int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); +#if CONFIG_LV_MAP + if (nsymbs == 2) + rate = 4 + (cdf[nsymbs] > 7) + (cdf[nsymbs] > 15) + get_msb(nsymbs); +#endif const int rate2 = 5; int i, tmp; int diff; @@ -158,7 +170,7 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { tmp = AOM_ICDF(tmp0); diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate; // Single loop (faster) -#if !CONFIG_ANS && CONFIG_EC_SMALLMUL +#if !CONFIG_ANS for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) { tmp -= (i == val ? diff : 0); cdf[i] += ((tmp - cdf[i]) >> rate); @@ -183,6 +195,12 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { cdf[nsymbs] += (cdf[nsymbs] < 32); } +#if CONFIG_LV_MAP +static INLINE void update_bin(aom_cdf_prob *cdf, int val, int nsymbs) { + update_cdf(cdf, val, nsymbs); +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c index 461c13729..d543f12d1 100644 --- a/third_party/aom/aom_dsp/psnr.c +++ b/third_party/aom/aom_dsp/psnr.c @@ -289,6 +289,27 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, } #endif // CONFIG_HIGHBITDEPTH +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd) { +#if CONFIG_HIGHBITDEPTH + if (highbd) { + switch (plane) { + case 0: return aom_highbd_get_y_sse(a, b); + case 1: return aom_highbd_get_u_sse(a, b); + case 2: return aom_highbd_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } + } +#endif + (void)highbd; + switch (plane) { + case 0: return aom_get_y_sse(a, b); + case 1: return aom_get_u_sse(a, b); + case 2: return aom_get_v_sse(a, b); + default: assert(plane >= 0 && plane <= 2); return 0; + } +} + #if CONFIG_HIGHBITDEPTH void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, @@ -296,9 +317,7 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; - const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; - const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; int i; uint64_t total_sse = 0; @@ -313,14 +332,15 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, uint64_t sse; if (a->flags & YV12_FLAG_HIGHBITDEPTH) { if (input_shift) { - sse = highbd_get_sse_shift(a_planes[i], a_strides[i], b_planes[i], + sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h, input_shift); } else { - sse = highbd_get_sse(a_planes[i], a_strides[i], b_planes[i], + sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); } } else { - sse = get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, + h); } psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; @@ -344,9 +364,7 @@ void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width }; const int heights[3] = { a->y_crop_height, a->uv_crop_height, a->uv_crop_height }; - const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer }; const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride }; - const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer }; const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride }; int i; uint64_t total_sse = 0; @@ -357,7 +375,7 @@ void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, const int h = heights[i]; const uint32_t samples = w * h; const uint64_t sse = - get_sse(a_planes[i], a_strides[i], b_planes[i], b_strides[i], w, h); + get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h); psnr->sse[1 + i] = sse; psnr->samples[1 + i] = samples; psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse); diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h index 480140e6f..df5f8f9f2 100644 --- a/third_party/aom/aom_dsp/psnr.h +++ b/third_party/aom/aom_dsp/psnr.h @@ -47,6 +47,8 @@ int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); +int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, + const YV12_BUFFER_CONFIG *b, int plane, int highbd); #if CONFIG_HIGHBITDEPTH int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c index fe98b6028..21bcc486a 100644 --- a/third_party/aom/aom_dsp/quantize.c +++ b/third_party/aom/aom_dsp/quantize.c @@ -12,18 +12,14 @@ #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" -static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, -#if CONFIG_AOM_QM - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, -#endif - const int log_scale) { +void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; @@ -37,20 +33,12 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Pre-scan pass for (i = (int)n_coeffs - 1; i >= 0; i--) { const int rc = scan[i]; -#if CONFIG_AOM_QM - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; -#else - const int coeff = coeff_ptr[rc]; -#endif // CONFIG_AOM_QM -#if CONFIG_AOM_QM - if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && - coeff > (nzbins[rc != 0] << AOM_QM_BITS)) + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) non_zero_count--; -#else - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) non_zero_count--; -#endif // CONFIG_AOM_QM else break; } @@ -64,35 +52,21 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; int tmp32; -#if CONFIG_AOM_QM - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { -#else - if (abs_coeff >= zbins[rc != 0]) { -#endif // CONFIG_AOM_QM int64_t tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), INT16_MIN, INT16_MAX); -#if CONFIG_AOM_QM tmp *= wt; tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * quant_shift_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); // quantization -#else - tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - (16 - log_scale)); // quantization -#endif // CONFIG_AOM_QM qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; -#if CONFIG_AOM_QM + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); const int dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); -#else - dqcoeff_ptr[rc] = - qcoeff_ptr[rc] * dequant_ptr[rc != 0] / (1 << log_scale); -#endif // CONFIG_AOM_QM if (tmp32) eob = i; } @@ -101,324 +75,25 @@ static void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } -void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan -#if CONFIG_AOM_QM - , - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr -#endif - ) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, -#if CONFIG_AOM_QM - qm_ptr, iqm_ptr, -#endif - 0); -} - -void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan -#if CONFIG_AOM_QM - , - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr -#endif - ) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, -#if CONFIG_AOM_QM - qm_ptr, iqm_ptr, -#endif - 1); -} - -#if CONFIG_TX64X64 -void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan -#if CONFIG_AOM_QM - , - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr -#endif - ) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, -#if CONFIG_AOM_QM - qm_ptr, iqm_ptr, -#endif - 2); -} -#endif // CONFIG_TX64X64 - -#if CONFIG_AOM_QM -void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int64_t tmp, eob = -1; - int32_t tmp32; - int dequant = - (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (16 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; - if (tmp32) eob = 0; - } - *eob_ptr = eob + 1; -} - -void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int64_t tmp, eob = -1; - int32_t tmp32; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (15 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dequant = - (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; - if (tmp32) eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int64_t tmp, eob = -1; - int32_t tmp32; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), - INT16_MIN, INT16_MAX); - tmp32 = (int32_t)((tmp * qm_ptr[rc] * quant) >> (14 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dequant = - (dequant_ptr * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; - if (tmp32) eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - int eob = -1; - int dequant = - (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[0]; - const uint32_t abs_qcoeff = - (uint32_t)((tmp * qm_ptr[0] * quant) >> (16 + AOM_QM_BITS)); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - int eob = -1; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); - const uint32_t abs_qcoeff = - (uint32_t)((tmp * qm_ptr[0] * quant) >> (15 + AOM_QM_BITS)); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 2; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int n_coeffs = 1024; - int eob = -1; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); - const uint32_t abs_qcoeff = - (uint32_t)((tmp * qm_ptr[0] * quant) >> (14 + AOM_QM_BITS)); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr * iqm_ptr[0] + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[0] = (qcoeff_ptr[0] * dequant) / 4; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - int dequant; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr[rc]; - const int coeff = coeff_ptr[rc] * wt; - - if (coeff < (zbins[rc != 0] << AOM_QM_BITS) && - coeff > (nzbins[rc != 0] << AOM_QM_BITS)) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const qm_val_t wt = qm_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmpw = tmp1 * wt; - const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (16 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; - if (abs_qcoeff) eob = i; - } - } - } - *eob_ptr = eob + 1; -} - -void aom_highbd_quantize_b_32x32_c( +void highbd_quantize_b_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[1024]; + const qm_val_t *iqm_ptr, const int log_scale) { int i, eob = -1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int dequant; +#if CONFIG_TX64X64 + int idx_arr[4096]; +#else + int idx_arr[1024]; +#endif (void)iscan; + int idx = 0; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -427,13 +102,13 @@ void aom_highbd_quantize_b_32x32_c( // Pre-scan pass for (i = 0; i < n_coeffs; i++) { const int rc = scan[i]; - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); const int coeff = coeff_ptr[rc] * wt; // If the coefficient is out of the base ZBIN range, keep it for // quantization. - if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || - coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) idx_arr[idx++] = i; } @@ -443,134 +118,112 @@ void aom_highbd_quantize_b_32x32_c( const int rc = scan[idx_arr[i]]; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); - const qm_val_t wt = qm_ptr[rc]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); const int64_t tmpw = tmp1 * wt; const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (15 + AOM_QM_BITS)); + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2; + dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); if (abs_qcoeff) eob = idx_arr[i]; } } *eob_ptr = eob + 1; } -#if CONFIG_TX64X64 -void aom_highbd_quantize_b_64x64_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), - ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[4096]; - int i, eob = -1; +void quantize_dc_helper(const tran_low_t *coeff_ptr, int n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + uint16_t *eob_ptr, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp; + int eob = -1; + int32_t tmp32; int dequant; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr[rc]; - const int coeff = coeff_ptr[rc] * wt; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= (zbins[rc != 0] << AOM_QM_BITS) || - coeff <= (nzbins[rc != 0] << AOM_QM_BITS)) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const qm_val_t wt = qm_ptr[rc]; - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); - const int64_t tmpw = tmp1 * wt; - const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> (14 + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = - (dequant_ptr[rc != 0] * iqm_ptr[rc] + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 4; - if (abs_qcoeff) eob = idx_arr[i]; - } + const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); + if (tmp32) eob = 0; } *eob_ptr = eob + 1; } -#endif // CONFIG_TX64X64 -#else // CONFIG_AOM_QM +/* These functions should only be called when quantisation matrices + are not used. */ +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); +} + +void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); +} + +#if CONFIG_TX64X64 +void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); +} +#endif // CONFIG_TX64X64 void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; - if (tmp) eob = 0; - } - *eob_ptr = eob + 1; + quantize_dc_helper(coeff_ptr, n_coeffs, skip_block, round_ptr, quant, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, + 0); } void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int n_coeffs = 1024; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; - if (tmp) eob = 0; - } - *eob_ptr = eob + 1; + quantize_dc_helper(coeff_ptr, 1024, skip_block, round_ptr, quant, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 1); } #if CONFIG_TX64X64 @@ -578,100 +231,8 @@ void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr) { - const int n_coeffs = 4096; - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp, eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2), - INT16_MIN, INT16_MAX); - tmp = (tmp * quant) >> 14; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 4; - if (tmp) eob = 0; - } - *eob_ptr = eob + 1; -} -#endif // CONFIG_TX64X64 - -void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr) { - int eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[0]; - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 16); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { - const int n_coeffs = 1024; - int eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 15); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; -} - -#if CONFIG_TX64X64 -void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { - const int n_coeffs = 4096; - int eob = -1; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 2); - const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> 14); - qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 4; - if (abs_qcoeff) eob = 0; - } - *eob_ptr = eob + 1; + quantize_dc_helper(coeff_ptr, 4096, skip_block, round_ptr, quant, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 2); } #endif // CONFIG_TX64X64 @@ -682,45 +243,10 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } - - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } - } - } - *eob_ptr = eob + 1; + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 0); } void aom_highbd_quantize_b_32x32_c( @@ -729,47 +255,10 @@ void aom_highbd_quantize_b_32x32_c( const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[1024]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = idx_arr[i]; - } - } - *eob_ptr = eob + 1; + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); } #if CONFIG_TX64X64 @@ -779,47 +268,9 @@ void aom_highbd_quantize_b_64x64_c( const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 2), - ROUND_POWER_OF_TWO(zbin_ptr[1], 2) }; - const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - - int idx = 0; - int idx_arr[4096]; - int i, eob = -1; - (void)iscan; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } - - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; - if (abs_qcoeff) eob = idx_arr[i]; - } - } - *eob_ptr = eob + 1; + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, + round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 2); } #endif // CONFIG_TX64X64 -#endif // CONFIG_AOM_QM diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h index fe49b830f..03609e8b4 100644 --- a/third_party/aom/aom_dsp/quantize.h +++ b/third_party/aom/aom_dsp/quantize.h @@ -19,32 +19,57 @@ extern "C" { #endif -#if CONFIG_AOM_QM +void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan); + +void highbd_quantize_b_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +#if CONFIG_HIGHBITDEPTH +void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); +#endif + void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); + const int16_t dequant_ptr, uint16_t *eob_ptr); void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); + const int16_t dequant_ptr, uint16_t *eob_ptr); #if CONFIG_TX64X64 void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); + const int16_t dequant_ptr, uint16_t *eob_ptr); #endif // CONFIG_TX64X64 -void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr); + +#if CONFIG_AOM_QM #if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, @@ -64,32 +89,10 @@ void aom_highbd_quantize_dc_64x64( const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); #endif // CONFIG_TX64X64 -void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, - const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr); #endif // CONFIG_HIGHBITDEPTH #else // CONFIG_AOM_QM -void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#if CONFIG_TX64X64 -void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#endif // CONFIG_TX64X64 #if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c index b9c789ce5..6b8ca669b 100644 --- a/third_party/aom/aom_dsp/sad.c +++ b/third_party/aom/aom_dsp/sad.c @@ -163,11 +163,19 @@ sadMxN(8, 32) sadMxNx4D(8, 32) sadMxN(32, 8) sadMxNx4D(32, 8) +sadMxN(16, 64) +sadMxNx4D(16, 64) +sadMxN(64, 16) +sadMxNx4D(64, 16) +sadMxN(32, 128) +sadMxNx4D(32, 128) +sadMxN(128, 32) +sadMxNx4D(128, 32) #endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; @@ -328,12 +336,20 @@ highbd_sadMxN(8, 32) highbd_sadMxNx4D(8, 32) highbd_sadMxN(32, 8) highbd_sadMxNx4D(32, 8) +highbd_sadMxN(16, 64) +highbd_sadMxNx4D(16, 64) +highbd_sadMxN(64, 16) +highbd_sadMxNx4D(64, 16) +highbd_sadMxN(32, 128) +highbd_sadMxNx4D(32, 128) +highbd_sadMxN(128, 32) +highbd_sadMxNx4D(128, 32) #endif /* clang-format on */ #endif // CONFIG_HIGHBITDEPTH -#if CONFIG_AV1 && CONFIG_EXT_INTER - static INLINE +#if CONFIG_AV1 + static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride, const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, const uint8_t *m, int m_stride, @@ -395,11 +411,15 @@ MASKSADMxN(4, 16) MASKSADMxN(16, 4) MASKSADMxN(8, 32) MASKSADMxN(32, 8) +MASKSADMxN(16, 64) +MASKSADMxN(64, 16) +MASKSADMxN(32, 128) +MASKSADMxN(128, 32) #endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, @@ -464,9 +484,13 @@ HIGHBD_MASKSADMXN(4, 16) HIGHBD_MASKSADMXN(16, 4) HIGHBD_MASKSADMXN(8, 32) HIGHBD_MASKSADMXN(32, 8) +HIGHBD_MASKSADMXN(16, 64) +HIGHBD_MASKSADMXN(64, 16) +HIGHBD_MASKSADMXN(32, 128) +HIGHBD_MASKSADMXN(128, 32) #endif #endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 && CONFIG_EXT_INTER +#endif // CONFIG_AV1 #if CONFIG_AV1 && CONFIG_MOTION_VAR // pre: predictor being evaluated @@ -522,11 +546,15 @@ OBMCSADMxN(4, 16) OBMCSADMxN(16, 4) OBMCSADMxN(8, 32) OBMCSADMxN(32, 8) +OBMCSADMxN(16, 64) +OBMCSADMxN(64, 16) +OBMCSADMxN(32, 128) +OBMCSADMxN(128, 32) #endif /* clang-format on */ #if CONFIG_HIGHBITDEPTH - static INLINE + static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int width, int height) { @@ -578,6 +606,10 @@ HIGHBD_OBMCSADMXN(4, 16) HIGHBD_OBMCSADMXN(16, 4) HIGHBD_OBMCSADMXN(8, 32) HIGHBD_OBMCSADMXN(32, 8) +HIGHBD_OBMCSADMXN(16, 64) +HIGHBD_OBMCSADMXN(64, 16) +HIGHBD_OBMCSADMXN(32, 128) +HIGHBD_OBMCSADMXN(128, 32) #endif /* clang-format on */ #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c index 141bf01c7..6ae378ff2 100644 --- a/third_party/aom/aom_dsp/ssim.c +++ b/third_party/aom/aom_dsp/ssim.c @@ -168,23 +168,16 @@ static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight) { - double a, b, c; - double ssimv; - - a = aom_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, - dest->y_stride, source->y_crop_width, source->y_crop_height); - - b = aom_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, source->uv_crop_height); - - c = aom_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, source->uv_crop_height); - - ssimv = a * .8 + .1 * (b + c); + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], source->crop_heights[is_uv]); + } *weight = 1; - - return ssimv; + return abc[0] * .8 + .1 * (abc[1] + abc[2]); } // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity @@ -433,30 +426,19 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd) { - double a, b, c; - double ssimv; - uint32_t shift = 0; - assert(bd >= in_bd); - shift = bd - in_bd; - - a = aom_highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, - dest->y_stride, source->y_crop_width, - source->y_crop_height, in_bd, shift); - - b = aom_highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - c = aom_highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - ssimv = a * .8 + .1 * (b + c); + const uint32_t shift = bd - in_bd; + + double abc[3]; + for (int i = 0; i < 3; ++i) { + const int is_uv = i > 0; + abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i], + source->strides[is_uv], dest->strides[is_uv], + source->crop_widths[is_uv], + source->crop_heights[is_uv], in_bd, shift); + } *weight = 1; - - return ssimv; + return abc[0] * .8 + .1 * (abc[1] + abc[2]); } - #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h index 01732ae64..ef9e9bc98 100644 --- a/third_party/aom/aom_dsp/txfm_common.h +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -13,6 +13,7 @@ #define AOM_DSP_TXFM_COMMON_H_ #include "aom_dsp/aom_dsp_common.h" +#include "av1/common/enums.h" // Constants and Macros used by all idct/dct functions #define DCT_CONST_BITS 14 @@ -23,18 +24,25 @@ typedef struct txfm_param { // for both forward and inverse transforms - int tx_type; - int tx_size; + TX_TYPE tx_type; + TX_SIZE tx_size; int lossless; int bd; #if CONFIG_MRC_TX || CONFIG_LGT + int is_inter; +#endif // CONFIG_MRC_TX || CONFIG_LGT +#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED int stride; uint8_t *dst; -#endif // CONFIG_MRC_TX || CONFIG_LGT -#if CONFIG_LGT - int is_inter; +#if CONFIG_MRC_TX + int *valid_mask; + uint8_t *mask; +#endif // CONFIG_MRC_TX +#if CONFIG_LGT_FROM_PRED int mode; -#endif + int use_lgt; +#endif // CONFIG_LGT_FROM_PRED +#endif // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED // for inverse transforms only #if CONFIG_ADAPT_SCAN const int16_t *eob_threshold; @@ -87,27 +95,608 @@ static const tran_high_t sinpi_4_9 = 15212; // 16384 * sqrt(2) static const tran_high_t Sqrt2 = 23170; +static const tran_high_t InvSqrt2 = 11585; static INLINE tran_high_t fdct_round_shift(tran_high_t input) { tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); return rv; } -#if CONFIG_LGT -// The Line Graph Transforms (LGTs) matrices are written as follows. -// Each 2D array is 16384 times an LGT matrix, which is the matrix of -// eigenvectors of the graph Laplacian matrices for the line graph. +#if CONFIG_LGT_FROM_PRED +// Use negative numbers so they do not coincide with lgt*[0][0], which are +// always nonnegative. +typedef enum { + DCT4 = -1, + ADST4 = -2, + DCT8 = -3, + ADST8 = -4, + DCT16 = -5, + ADST16 = -6, + DCT32 = -7, + ADST32 = -8, +} ButterflyLgt; -// LGT4 name: lgt4_140 -// Self loops: 1.400, 0.000, 0.000, 0.000 +/* These are some LGTs already implementated in the codec. When any of them + * is chosen, the flgt or ilgt function will call the existing fast + * transform instead of the matrix product implementation. Thus, we + * do not need the actual basis functions here */ +static const tran_high_t lgt4_000[1][1] = { { (tran_high_t)DCT4 } }; +static const tran_high_t lgt4_100[1][1] = { { (tran_high_t)ADST4 } }; +static const tran_high_t lgt8_000[1][1] = { { (tran_high_t)DCT8 } }; +static const tran_high_t lgt8_200[1][1] = { { (tran_high_t)ADST8 } }; +static const tran_high_t lgt16_000[1][1] = { { (tran_high_t)DCT16 } }; +static const tran_high_t lgt16_200[1][1] = { { (tran_high_t)ADST16 } }; +static const tran_high_t lgt32_000[1][1] = { { (tran_high_t)DCT32 } }; +static const tran_high_t lgt32_200[1][1] = { { (tran_high_t)ADST32 } }; + +/* The Line Graph Transforms (LGTs) matrices are written as follows. + Each 2D array is sqrt(2)*16384 times an LGT matrix, which is the + matrix of eigenvectors of the graph Laplacian matrix of the associated + line graph. Some of those transforms have fast algorithms but not + implemented yet for now. */ + +// LGT4 name: lgt4_150_000w3 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_150_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 5991, 13537, 17825, 0 }, + { 15515, 10788, -13408, 0 }, + { 16133, -15403, 6275, 0 }, +}; + +// LGT4 name: lgt4_100_000w3 +// Self loops: 1.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_100_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 7600, 13694, 17076, 0 }, + { 17076, 7600, -13694, 0 }, + { 13694, -17076, 7600, 0 }, +}; + +// LGT4 name: lgt4_060_000w3 +// Self loops: 0.600, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_060_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 9449, 13755, 16075, 0 }, + { 17547, 4740, -14370, 0 }, + { 11819, -18034, 8483, 0 }, +}; + +// LGT4 name: lgt4_000w3 +// Self loops: 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000 +static const tran_high_t lgt4_000w3[4][4] = { + { 0, 0, 0, 23170 }, + { 13377, 13377, 13377, 0 }, + { 16384, 0, -16384, 0 }, + { 9459, -18919, 9459, 0 }, +}; + +// LGT4 name: lgt4_150_000w2 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_150_000w2[4][4] = { + { 10362, 20724, 0, 0 }, + { 20724, -10362, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_100_000w2 +// Self loops: 1.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_100_000w2[4][4] = { + { 12181, 19710, 0, 0 }, + { 19710, -12181, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_060_000w2 +// Self loops: 0.600, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_060_000w2[4][4] = { + { 13831, 18590, 0, 0 }, + { 18590, -13831, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_000w2 +// Self loops: 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000 +static const tran_high_t lgt4_000w2[4][4] = { + { 16384, 16384, 0, 0 }, + { 16384, -16384, 0, 0 }, + { 0, 0, 16384, 16384 }, + { 0, 0, 16384, -16384 }, +}; + +// LGT4 name: lgt4_150_000w1 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_150_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_100_000w1 +// Self loops: 1.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_100_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_060_000w1 +// Self loops: 0.600, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_060_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_000w1 +// Self loops: 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000 +static const tran_high_t lgt4_000w1[4][4] = { + { 23170, 0, 0, 0 }, + { 0, 13377, 13377, 13377 }, + { 0, 16384, 0, -16384 }, + { 0, 9459, -18919, 9459 }, +}; + +// LGT4 name: lgt4_060 +// Self loops: 0.600, 0.000, 0.000, 0.000 // Edges: 1.000, 1.000, 1.000 -static const tran_high_t lgt4_140[4][4] = { - { 4206, 9518, 13524, 15674 }, - { 11552, 14833, 1560, -13453 }, - { 15391, -1906, -14393, 9445 }, - { 12201, -14921, 12016, -4581 }, +static const tran_high_t lgt4_060[4][4] = { + { 6971, 10504, 13060, 14400 }, + { 14939, 11211, -2040, -13559 }, + { 14096, -8258, -12561, 10593 }, + { 8150, -15253, 14295, -5784 }, +}; + +// LGT4 name: lgt4_150 +// Self loops: 1.500, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000 +static const tran_high_t lgt4_150[4][4] = { + { 3998, 9435, 13547, 15759 }, + { 11106, 15105, 1886, -13483 }, + { 15260, -1032, -14674, 9361 }, + { 12833, -14786, 11596, -4372 }, +}; + +// LGT8 name: lgt8_150_000w7 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_150_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 2522, 6185, 9551, 12461, 14775, 16381, 17204, 0 }, + { 7390, 15399, 16995, 11515, 1240, -9551, -16365, 0 }, + { 11716, 16625, 3560, -13353, -15831, -1194, 14733, 0 }, + { 15073, 8866, -14291, -10126, 13398, 11308, -12401, 0 }, + { 16848, -4177, -13724, 14441, 2923, -16628, 9513, 0 }, + { 15942, -14888, 5405, 7137, -15640, 15288, -6281, 0 }, + { 10501, -14293, 16099, -15670, 13063, -8642, 3021, 0 }, +}; + +// LGT8 name: lgt8_100_000w7 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_100_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 3518, 6883, 9946, 12575, 14654, 16093, 16829, 0 }, + { 9946, 16093, 16093, 9946, 0, -9946, -16093, 0 }, + { 14654, 14654, 0, -14654, -14654, 0, 14654, 0 }, + { 16829, 3518, -16093, -6883, 14654, 9946, -12575, 0 }, + { 16093, -9946, -9946, 16093, 0, -16093, 9946, 0 }, + { 12575, -16829, 9946, 3518, -14654, 16093, -6883, 0 }, + { 6883, -12575, 16093, -16829, 14654, -9946, 3518, 0 }, +}; + +// LGT8 name: lgt8_060_000w7 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_060_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 5087, 7951, 10521, 12701, 14411, 15587, 16186, 0 }, + { 13015, 16486, 14464, 7621, -1762, -10557, -15834, 0 }, + { 16581, 11475, -4050, -15898, -13311, 1362, 14798, 0 }, + { 16536, -1414, -16981, -3927, 15746, 8879, -12953, 0 }, + { 14104, -13151, -7102, 16932, -1912, -15914, 10385, 0 }, + { 10156, -17168, 11996, 1688, -14174, 16602, -7249, 0 }, + { 5295, -11721, 15961, -17224, 15274, -10476, 3723, 0 }, +}; + +// LGT8 name: lgt8_000w7 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 +static const tran_high_t lgt8_000w7[8][8] = { + { 0, 0, 0, 0, 0, 0, 0, 32768 }, + { 12385, 12385, 12385, 12385, 12385, 12385, 12385, 0 }, + { 17076, 13694, 7600, 0, -7600, -13694, -17076, 0 }, + { 15781, 3898, -10921, -17515, -10921, 3898, 15781, 0 }, + { 13694, -7600, -17076, 0, 17076, 7600, -13694, 0 }, + { 10921, -15781, -3898, 17515, -3898, -15781, 10921, 0 }, + { 7600, -17076, 13694, 0, -13694, 17076, -7600, 0 }, + { 3898, -10921, 15781, -17515, 15781, -10921, 3898, 0 }, +}; + +// LGT8 name: lgt8_150_000w6 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_150_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 3157, 7688, 11723, 15002, 17312, 18506, 0, 0 }, + { 9167, 17832, 16604, 6164, -7696, -17286, 0, 0 }, + { 14236, 15584, -4969, -18539, -6055, 14938, 0, 0 }, + { 17558, 1891, -18300, 5288, 16225, -11653, 0, 0 }, + { 17776, -13562, -647, 14380, -17514, 7739, 0, 0 }, + { 12362, -16318, 17339, -15240, 10399, -3688, 0, 0 }, +}; + +// LGT8 name: lgt8_100_000w6 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_100_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 4350, 8447, 12053, 14959, 16995, 18044, 0, 0 }, + { 12053, 18044, 14959, 4350, -8447, -16995, 0, 0 }, + { 16995, 12053, -8447, -18044, -4350, 14959, 0, 0 }, + { 18044, -4350, -16995, 8447, 14959, -12053, 0, 0 }, + { 14959, -16995, 4350, 12053, -18044, 8447, 0, 0 }, + { 8447, -14959, 18044, -16995, 12053, -4350, 0, 0 }, +}; + +// LGT8 name: lgt8_060_000w6 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_060_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 6154, 9551, 12487, 14823, 16446, 17277, 0, 0 }, + { 15149, 17660, 12503, 1917, -9502, -16795, 0, 0 }, + { 18166, 7740, -11772, -17465, -2656, 15271, 0, 0 }, + { 16682, -8797, -15561, 10779, 14189, -12586, 0, 0 }, + { 12436, -18234, 7007, 10763, -18483, 8945, 0, 0 }, + { 6591, -14172, 18211, -17700, 12766, -4642, 0, 0 }, +}; + +// LGT8 name: lgt8_000w6 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 +static const tran_high_t lgt8_000w6[8][8] = { + { 0, 0, 0, 0, 0, 0, 23170, 23170 }, + { 0, 0, 0, 0, 0, 0, 23170, -23170 }, + { 13377, 13377, 13377, 13377, 13377, 13377, 0, 0 }, + { 18274, 13377, 4896, -4896, -13377, -18274, 0, 0 }, + { 16384, 0, -16384, -16384, 0, 16384, 0, 0 }, + { 13377, -13377, -13377, 13377, 13377, -13377, 0, 0 }, + { 9459, -18919, 9459, 9459, -18919, 9459, 0, 0 }, + { 4896, -13377, 18274, -18274, 13377, -4896, 0, 0 }, +}; + +// LGT8 name: lgt8_150_000w5 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w5[8][8] = { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 4109, 9895, 14774, 18299, 20146, 0, 0, 0 }, + { 11753, 20300, 13161, -4148, -18252, 0, 0, 0 }, + { 17573, 10921, -16246, -12895, 14679, 0, 0, 0 }, + { 19760, -9880, -9880, 19760, -9880, 0, 0, 0 }, + { 14815, -18624, 17909, -12844, 4658, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_100_000w5 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w5[8][8] = { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 5567, 10683, 14933, 17974, 19559, 0, 0, 0 }, + { 14933, 19559, 10683, -5567, -17974, 0, 0, 0 }, + { 19559, 5567, -17974, -10683, 14933, 0, 0, 0 }, + { 17974, -14933, -5567, 19559, -10683, 0, 0, 0 }, + { 10683, -17974, 19559, -14933, 5567, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_060_000w5 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w5[8][8] = { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 7650, 11741, 15069, 17415, 18628, 0, 0, 0 }, + { 17824, 18002, 7558, -7345, -17914, 0, 0, 0 }, + { 19547, 569, -19303, -8852, 15505, 0, 0, 0 }, + { 15592, -17548, -2862, 19625, -11374, 0, 0, 0 }, + { 8505, -17423, 20218, -15907, 6006, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_000w5 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 +static const tran_high_t lgt8_000w5[8][8] = { + { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, + { 0, 0, 0, 0, 0, 23170, 0, -23170 }, + { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, + { 14654, 14654, 14654, 14654, 14654, 0, 0, 0 }, + { 19710, 12181, 0, -12181, -19710, 0, 0, 0 }, + { 16766, -6404, -20724, -6404, 16766, 0, 0, 0 }, + { 12181, -19710, 0, 19710, -12181, 0, 0, 0 }, + { 6404, -16766, 20724, -16766, 6404, 0, 0, 0 }, +}; + +// LGT8 name: lgt8_150_000w4 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w4[8][8] = { + { 5655, 13343, 19159, 22286, 0, 0, 0, 0 }, + { 15706, 21362, 2667, -19068, 0, 0, 0, 0 }, + { 21580, -1459, -20752, 13238, 0, 0, 0, 0 }, + { 18148, -20910, 16399, -6183, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_100_000w4 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w4[8][8] = { + { 7472, 14042, 18919, 21513, 0, 0, 0, 0 }, + { 18919, 18919, 0, -18919, 0, 0, 0, 0 }, + { 21513, -7472, -18919, 14042, 0, 0, 0, 0 }, + { 14042, -21513, 18919, -7472, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_060_000w4 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w4[8][8] = { + { 9858, 14855, 18470, 20365, 0, 0, 0, 0 }, + { 21127, 15855, -2886, -19175, 0, 0, 0, 0 }, + { 19935, -11679, -17764, 14980, 0, 0, 0, 0 }, + { 11525, -21570, 20217, -8180, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_000w4 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w4[8][8] = { + { 16384, 16384, 16384, 16384, 0, 0, 0, 0 }, + { 21407, 8867, -8867, -21407, 0, 0, 0, 0 }, + { 16384, -16384, -16384, 16384, 0, 0, 0, 0 }, + { 8867, -21407, 21407, -8867, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, + { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, + { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, + { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, +}; + +// LGT8 name: lgt8_150_000w3 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w3[8][8] = { + { 8473, 19144, 25209, 0, 0, 0, 0, 0 }, + { 21942, 15257, -18961, 0, 0, 0, 0, 0 }, + { 22815, -21783, 8874, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_100_000w3 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w3[8][8] = { + { 10747, 19366, 24149, 0, 0, 0, 0, 0 }, + { 24149, 10747, -19366, 0, 0, 0, 0, 0 }, + { 19366, -24149, 10747, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_060_000w3 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w3[8][8] = { + { 13363, 19452, 22733, 0, 0, 0, 0, 0 }, + { 24815, 6704, -20323, 0, 0, 0, 0, 0 }, + { 16715, -25503, 11997, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_000w3 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w3[8][8] = { + { 18919, 18919, 18919, 0, 0, 0, 0, 0 }, + { 23170, 0, -23170, 0, 0, 0, 0, 0 }, + { 13377, -26755, 13377, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, + { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, + { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, + { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, + { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, +}; + +// LGT8 name: lgt8_150_000w2 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w2[8][8] = { + { 14654, 29309, 0, 0, 0, 0, 0, 0 }, + { 29309, -14654, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_100_000w2 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w2[8][8] = { + { 17227, 27874, 0, 0, 0, 0, 0, 0 }, + { 27874, -17227, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_060_000w2 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w2[8][8] = { + { 19560, 26290, 0, 0, 0, 0, 0, 0 }, + { 26290, -19560, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_000w2 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w2[8][8] = { + { 23170, 23170, 0, 0, 0, 0, 0, 0 }, + { 23170, -23170, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, + { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, + { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, + { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, + { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, + { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, +}; + +// LGT8 name: lgt8_150_000w1 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_100_000w1 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_060_000w1 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_000w1 +// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_000w1[8][8] = { + { 32768, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, + { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, + { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, + { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, + { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, + { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, + { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, +}; + +// LGT8 name: lgt8_060 +// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_060[8][8] = { + { 4295, 6746, 8999, 10987, 12653, 13947, 14832, 15280 }, + { 11303, 15101, 14912, 10786, 3812, -4168, -11047, -15010 }, + { 15051, 13208, 1823, -10879, -15721, -9207, 3959, 14265 }, + { 15871, 3800, -13441, -12395, 5516, 15922, 4665, -12939 }, + { 14630, -7269, -13926, 8618, 13091, -9886, -12133, 11062 }, + { 12008, -14735, 180, 14586, -12245, -4458, 15932, -8720 }, + { 8472, -15623, 14088, -4721, -7272, 15221, -14708, 6018 }, + { 4372, -9862, 13927, -15981, 15727, -13202, 8770, -3071 }, +}; + +// LGT8 name: lgt8_100 +// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_100[8][8] = { + { 2921, 5742, 8368, 10708, 12684, 14228, 15288, 15827 }, + { 8368, 14228, 15827, 12684, 5742, -2921, -10708, -15288 }, + { 12684, 15288, 5742, -8368, -15827, -10708, 2921, 14228 }, + { 15288, 8368, -10708, -14228, 2921, 15827, 5742, -12684 }, + { 15827, -2921, -15288, 5742, 14228, -8368, -12684, 10708 }, + { 14228, -12684, -2921, 15288, -10708, -5742, 15827, -8368 }, + { 10708, -15827, 12684, -2921, -8368, 15288, -14228, 5742 }, + { 5742, -10708, 14228, -15827, 15288, -12684, 8368, -2921 }, }; +#endif // CONFIG_LGT_FROM_PRED +#if CONFIG_LGT || CONFIG_LGT_FROM_PRED // LGT4 name: lgt4_170 // Self loops: 1.700, 0.000, 0.000, 0.000 // Edges: 1.000, 1.000, 1.000 @@ -118,18 +707,14 @@ static const tran_high_t lgt4_170[4][4] = { { 14138, -14420, 10663, -3920 }, }; -// LGT8 name: lgt8_150 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_150[8][8] = { - { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 }, - { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 }, - { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 }, - { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 }, - { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 }, - { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 }, - { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 }, - { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 }, +// LGT4 name: lgt4_140 +// Self loops: 1.400, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000 +static const tran_high_t lgt4_140[4][4] = { + { 4206, 9518, 13524, 15674 }, + { 11552, 14833, 1560, -13453 }, + { 15391, -1906, -14393, 9445 }, + { 12201, -14921, 12016, -4581 }, }; // LGT8 name: lgt8_170 @@ -145,5 +730,19 @@ static const tran_high_t lgt8_170[8][8] = { { 15533, -13869, 6559, 3421, -12009, 15707, -13011, 5018 }, { 11357, -13726, 14841, -14600, 13025, -10259, 6556, -2254 }, }; -#endif // CONFIG_LGT + +// LGT8 name: lgt8_150 +// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 +// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 +static const tran_high_t lgt8_150[8][8] = { + { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 }, + { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 }, + { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 }, + { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 }, + { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 }, + { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 }, + { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 }, + { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 }, +}; +#endif // CONFIG_LGT || CONFIG_LGT_FROM_PRED #endif // AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index a4c3616e7..3c99aa155 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -256,7 +256,13 @@ VARIANCES(4, 16) VARIANCES(16, 4) VARIANCES(8, 32) VARIANCES(32, 8) -#endif +VARIANCES(16, 64) +VARIANCES(64, 16) +#if CONFIG_EXT_PARTITION +VARIANCES(32, 128) +VARIANCES(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES GET_VAR(16, 16) GET_VAR(8, 8) @@ -661,7 +667,13 @@ HIGHBD_VARIANCES(4, 16) HIGHBD_VARIANCES(16, 4) HIGHBD_VARIANCES(8, 32) HIGHBD_VARIANCES(32, 8) -#endif +HIGHBD_VARIANCES(16, 64) +HIGHBD_VARIANCES(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_VARIANCES(32, 128) +HIGHBD_VARIANCES(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) @@ -761,7 +773,7 @@ void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, } #endif // CONFIG_HIGHBITDEPTH -#if CONFIG_AV1 && CONFIG_EXT_INTER +#if CONFIG_AV1 void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, @@ -848,7 +860,13 @@ MASK_SUBPIX_VAR(4, 16) MASK_SUBPIX_VAR(16, 4) MASK_SUBPIX_VAR(8, 32) MASK_SUBPIX_VAR(32, 8) -#endif +MASK_SUBPIX_VAR(16, 64) +MASK_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +MASK_SUBPIX_VAR(32, 128) +MASK_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #if CONFIG_HIGHBITDEPTH void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, @@ -985,9 +1003,15 @@ HIGHBD_MASK_SUBPIX_VAR(4, 16) HIGHBD_MASK_SUBPIX_VAR(16, 4) HIGHBD_MASK_SUBPIX_VAR(8, 32) HIGHBD_MASK_SUBPIX_VAR(32, 8) -#endif +HIGHBD_MASK_SUBPIX_VAR(16, 64) +HIGHBD_MASK_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR(32, 128) +HIGHBD_MASK_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 && CONFIG_EXT_INTER +#endif // CONFIG_AV1 #if CONFIG_AV1 && CONFIG_MOTION_VAR static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, @@ -1094,7 +1118,17 @@ OBMC_VAR(8, 32) OBMC_SUBPIX_VAR(8, 32) OBMC_VAR(32, 8) OBMC_SUBPIX_VAR(32, 8) -#endif +OBMC_VAR(16, 64) +OBMC_SUBPIX_VAR(16, 64) +OBMC_VAR(64, 16) +OBMC_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +OBMC_VAR(32, 128) +OBMC_SUBPIX_VAR(32, 128) +OBMC_VAR(128, 32) +OBMC_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #if CONFIG_HIGHBITDEPTH static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, @@ -1287,6 +1321,16 @@ HIGHBD_OBMC_VAR(8, 32) HIGHBD_OBMC_SUBPIX_VAR(8, 32) HIGHBD_OBMC_VAR(32, 8) HIGHBD_OBMC_SUBPIX_VAR(32, 8) -#endif +HIGHBD_OBMC_VAR(16, 64) +HIGHBD_OBMC_SUBPIX_VAR(16, 64) +HIGHBD_OBMC_VAR(64, 16) +HIGHBD_OBMC_SUBPIX_VAR(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_OBMC_VAR(32, 128) +HIGHBD_OBMC_SUBPIX_VAR(32, 128) +HIGHBD_OBMC_VAR(128, 32) +HIGHBD_OBMC_SUBPIX_VAR(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #endif // CONFIG_HIGHBITDEPTH #endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index 20f0895cb..a193df467 100644 --- a/third_party/aom/aom_dsp/variance.h +++ b/third_party/aom/aom_dsp/variance.h @@ -54,7 +54,7 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse, const uint8_t *second_pred); -#if CONFIG_AV1 && CONFIG_EXT_INTER +#if CONFIG_AV1 typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, @@ -64,7 +64,7 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); -#endif // CONFIG_AV1 && CONFIG_EXT_INTER +#endif // CONFIG_AV1 #if CONFIG_AV1 && CONFIG_MOTION_VAR typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, @@ -90,10 +90,8 @@ typedef struct aom_variance_vtable { aom_sad_multi_fn_t sdx3f; aom_sad_multi_fn_t sdx8f; aom_sad_multi_d_fn_t sdx4df; -#if CONFIG_EXT_INTER aom_masked_sad_fn_t msdf; aom_masked_subpixvariance_fn_t msvf; -#endif // CONFIG_EXT_INTER #if CONFIG_MOTION_VAR aom_obmc_sad_fn_t osdf; aom_obmc_variance_fn_t ovf; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm index 357f37401..8688fb544 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -346,9 +346,15 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ psraw m0, 7 psraw m4, 7 %ifidn %1, h8_add_src +%if ARCH_X86=1 && CONFIG_PIC=1 + pcmpeqb m2, m2 ;all ones + psrlw m2, 8 ;even_byte_mask +%else + mova m2, [GLOBAL(even_byte_mask)] +%endif movu m5, [srcq] mova m7, m5 - pand m5, [even_byte_mask] + pand m5, m2 psrlw m7, 8 paddsw m0, m5 paddsw m4, m7 diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h new file mode 100644 index 000000000..5f9596a74 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_COMMON_AVX2_H +#define AOM_DSP_X86_COMMON_AVX2_H + +#include + +#include "./aom_config.h" + +// Note: in and out could have the same value +static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { + __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); + __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); + __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); + __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); + __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); + __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); + __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); + __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); + + __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); + __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); + __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); + __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); + __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); + __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); + __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); + __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); + + // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b + // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f + // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b + // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f + // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b + // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f + // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b + // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f + + // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b + // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f + // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb + // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf + // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db + // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df + // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb + // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff + + __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); + __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); + __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); + __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); + __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); + __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); + __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); + __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); + + __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); + __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); + __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); + __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); + __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); + __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); + __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); + __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); + + // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 + // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b + // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d + // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f + // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 + // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b + // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d + // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f + + // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 + // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb + // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd + // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf + // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 + // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb + // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd + // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff + + tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); + tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); + tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); + tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); + tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); + tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); + tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); + tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); + + tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); + tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); + tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); + tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); + tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); + tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); + tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); + tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); + + // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 + // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 + // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a + // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b + // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c + // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d + // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e + // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f + + // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 + // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 + // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa + // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb + // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc + // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd + // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe + // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff + + out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 + out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 + out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); + out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); + out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); + out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); + out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); + out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); + + out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); + out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); + out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); + out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); + out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); + out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); + out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); + out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); +} +#endif diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h index d3aceae00..86df4a6f6 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h @@ -15,21 +15,21 @@ #include "./aom_config.h" static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) { -#if CONFIG_HIGHBITDEPTH - const __m256i zero = _mm256_setzero_si256(); - const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); + if (sizeof(tran_low_t) == 4) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); - __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); - __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); + __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); + __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); - __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); - __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); + __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); + __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); - _mm256_storeu_si256((__m256i *)out, y0); - _mm256_storeu_si256((__m256i *)(out + 8), y1); -#else - _mm256_storeu_si256((__m256i *)out, *coeff); -#endif + _mm256_storeu_si256((__m256i *)out, y0); + _mm256_storeu_si256((__m256i *)(out + 8), y1); + } else { + _mm256_storeu_si256((__m256i *)out, *coeff); + } } #endif // AOM_DSP_X86_FWD_TXFM_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 26b2db2e0..58e8971dd 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -247,16 +247,16 @@ static INLINE int k_check_epi32_overflow_32( } static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_store_si128((__m128i *)(dst_ptr), out0); - _mm_store_si128((__m128i *)(dst_ptr + 4), out1); -#else - _mm_store_si128((__m128i *)(dst_ptr), *poutput); -#endif // CONFIG_HIGHBITDEPTH + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_store_si128((__m128i *)(dst_ptr), *poutput); + } } static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c new file mode 100644 index 000000000..41b55c985 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// D45E_PRED +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y, + const __m256i *z) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a = _mm256_avg_epu16(*x, *z); + const __m256i b = + _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one)); + return _mm256_avg_epu16(b, *y); +} + +static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1, + const __m256i *a2, uint16_t **dst, + ptrdiff_t stride) { + const __m256i y = avg3_epu16(a0, a1, a2); + _mm256_storeu_si256((__m256i *)*dst, y); + *dst += stride; +} + +void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 9); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 9)); + x0 = _mm256_insert_epi16(x0, above[23], 15); + const __m256i y = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 15); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); + x2 = _mm256_insert_epi16(x2, above[31], 15); + const __m256i y = avg3_epu16(&x0, &x1, &x2); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + + d45e_w16(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x1, &x2, &x0, &dst, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x2, &x0, &x1, &dst, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); + d45e_w16(&x0, &x1, &x2, &dst, stride); + } while (i < 33); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); + x0 = _mm256_insert_epi16(x0, above[47], 15); + const __m256i y = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst, y); +} + +void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); + __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); + + uint16_t *dst1 = dst; + uint16_t *dst2 = dst + 16; + + d45e_w16(&x0, &x1, &x2, &dst1, stride); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x0, &x1, &x2, &dst1, stride); + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + } while (i < 15); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i u = avg3_epu16(&x0, &x1, &x2); + _mm256_storeu_si256((__m256i *)dst1, u); + + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17)); + y2 = _mm256_insert_epi16(y2, above[47], 15); + u = avg3_epu16(&y0, &y1, &y2); + _mm256_storeu_si256((__m256i *)dst2, u); +} + +void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m256i x0 = _mm256_loadu_si256((const __m256i *)above); + __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); + __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); + __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); + __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); + __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); + + uint16_t *dst1 = dst; + uint16_t *dst2 = dst + 16; + + d45e_w16(&x0, &x1, &x2, &dst1, stride); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + + int i = 3; + do { + x0 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x1, &x2, &x0, &dst1, stride); + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y1, &y2, &y0, &dst2, stride); + + x1 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x2, &x0, &x1, &dst1, stride); + y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y2, &y0, &y1, &dst2, stride); + + x2 = _mm256_loadu_si256((const __m256i *)(above + i)); + d45e_w16(&x0, &x1, &x2, &dst1, stride); + y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); + d45e_w16(&y0, &y1, &y2, &dst2, stride); + } while (i < 33); + + x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); + __m256i u = avg3_epu16(&x1, &x2, &x0); + _mm256_storeu_si256((__m256i *)dst1, u); + + y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33)); + y0 = _mm256_insert_epi16(y0, above[63], 15); + u = avg3_epu16(&y1, &y2, &y0); + _mm256_storeu_si256((__m256i *)dst2, u); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm index 5d84ef8a7..91b3d126c 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm @@ -257,200 +257,3 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above dec nlines4d jnz .loop REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps - movd m1, [aboveq-2] - movq m0, [aboveq] - pshuflw m1, m1, 0x0 - movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 - movlhps m1, m1 ; tl tl tl tl tl tl tl tl - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m3, m3 - movd m4, bpsd - psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl - psllw m3, m4 - pcmpeqw m2, m2 - pxor m4, m4 ; min possible value - pxor m3, m2 ; max possible value - mova m1, [leftq] - pshuflw m2, m1, 0x0 - pshuflw m5, m1, 0x55 - movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m2, m3 - pmaxsw m2, m4 - ;Store the values - movq [dstq ], m2 - movhpd [dstq+strideq*2], m2 - lea dstq, [dstq+strideq*4] - pshuflw m2, m1, 0xaa - pshuflw m5, m1, 0xff - movlhps m2, m5 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m2, m3 - pmaxsw m2, m4 - ;Store the values - movq [dstq ], m2 - movhpd [dstq+strideq*2], m2 - RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one - movd m1, [aboveq-2] - mova m0, [aboveq] - pshuflw m1, m1, 0x0 - ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m3, m3 - pxor m4, m4 - pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 - pshuflw m3, m3, 0x0 - DEFINE_ARGS dst, stride, line, left - punpcklqdq m3, m3 - mov lineq, -4 - mova m2, m3 - punpcklqdq m1, m1 - psllw m3, m4 - add leftq, 16 - psubw m3, m2 ; max possible value - pxor m4, m4 ; min possible value - psubw m0, m1 -.loop: - movd m1, [leftq+lineq*4] - movd m2, [leftq+lineq*4+2] - pshuflw m1, m1, 0x0 - pshuflw m2, m2, 0x0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - paddw m1, m0 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m1, m3 - pminsw m2, m3 - pmaxsw m1, m4 - pmaxsw m2, m4 - ;Store the values - mova [dstq ], m1 - mova [dstq+strideq*2], m2 - lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps - movd m2, [aboveq-2] - mova m0, [aboveq] - mova m1, [aboveq+16] - pshuflw m2, m2, 0x0 - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m3, m3 - movd m4, bpsd - punpcklqdq m2, m2 - psllw m3, m4 - pcmpeqw m5, m5 - pxor m4, m4 ; min possible value - pxor m3, m5 ; max possible value - DEFINE_ARGS dst, stride, line, left - mov lineq, -8 - psubw m0, m2 - psubw m1, m2 -.loop: - movd m7, [leftq] - pshuflw m5, m7, 0x0 - pshuflw m2, m7, 0x55 - punpcklqdq m5, m5 ; l1 l1 l1 l1 l1 l1 l1 l1 - punpcklqdq m2, m2 ; l2 l2 l2 l2 l2 l2 l2 l2 - paddw m6, m5, m0 ; t1-tl+l1 to t4-tl+l1 - paddw m5, m1 ; t5-tl+l1 to t8-tl+l1 - pminsw m6, m3 - pminsw m5, m3 - pmaxsw m6, m4 ; Clamp to the bit-depth - pmaxsw m5, m4 - mova [dstq ], m6 - mova [dstq +16], m5 - paddw m6, m2, m0 - paddw m2, m1 - pminsw m6, m3 - pminsw m2, m3 - pmaxsw m6, m4 - pmaxsw m2, m4 - mova [dstq+strideq*2 ], m6 - mova [dstq+strideq*2+16], m2 - lea dstq, [dstq+strideq*4] - inc lineq - lea leftq, [leftq+4] - - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps - movd m0, [aboveq-2] - mova m1, [aboveq] - mova m2, [aboveq+16] - mova m3, [aboveq+32] - mova m4, [aboveq+48] - pshuflw m0, m0, 0x0 - ; Get the values to compute the maximum value at this bit depth - pcmpeqw m5, m5 - movd m6, bpsd - psllw m5, m6 - pcmpeqw m7, m7 - pxor m6, m6 ; min possible value - pxor m5, m7 ; max possible value - punpcklqdq m0, m0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - psubw m1, m0 - psubw m2, m0 - psubw m3, m0 - psubw m4, m0 -.loop: - movd m7, [leftq] - pshuflw m7, m7, 0x0 - punpcklqdq m7, m7 ; l1 l1 l1 l1 l1 l1 l1 l1 - paddw m0, m7, m1 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq ], m0 - paddw m0, m7, m2 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +16], m0 - paddw m0, m7, m3 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +32], m0 - paddw m0, m7, m4 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq +48], m0 - movd m7, [leftq+2] - pshuflw m7, m7, 0x0 - punpcklqdq m7, m7 ; l2 l2 l2 l2 l2 l2 l2 l2 - paddw m0, m7, m1 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2 ], m0 - paddw m0, m7, m2 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+16], m0 - paddw m0, m7, m3 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+32], m0 - paddw m0, m7, m4 - pminsw m0, m5 - pmaxsw m0, m6 - mova [dstq+strideq*2+48], m0 - lea dstq, [dstq+strideq*4] - lea leftq, [leftq+4] - inc lineq - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c new file mode 100644 index 000000000..691e166cf --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c @@ -0,0 +1,1256 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); + dst += stride << 2; + left += 4; + aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd); +} + +void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); +} + +void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); + dst += stride << 3; + left += 8; + aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)bd; + h_predictor_16x8(dst, stride, left); +} + +void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_16x8(dst, stride, left); + dst += stride << 3; + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); +} + +void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + h_predictor_32x8(dst, stride, left); + dst += stride << 3; + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP, DC_LEFT, DC_128 + +// 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 4x8 + +static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +// Shared with DC 8xh +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sum = dc_sum_8(left); + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x8(dst, stride, &dc); +} + +void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x8(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 8xh + +static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, const uint16_t *above) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + dc_store_8xh(dst, stride, height, &dc); +} + +void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 4, above); +} + +void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 8, above); +} + +void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + dc_top_predictor_8xh(dst, stride, 16, above); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 4, &dc); +} + +void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 8, &dc); +} + +// Shared with DC 16xh +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_8xh(dst, stride, 16, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride, + int height, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + dc_store_8xh(dst, stride, height, &dc_dup); +} + +void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 4, bd); +} + +void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 8, bd); +} + +void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)above; + (void)left; + dc_128_predictor_8xh(dst, stride, 16, bd); +} + +// ----------------------------------------------------------------------------- +// 16xh + +static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +// Shared with 32xh +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 8, &dc); +} + +void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16xh(dst, stride, 32, &dc); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 8, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// 32xh + +static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < height; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 16, &dc); +} + +void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 16, &dc_dup); +} + +void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32xh(dst, stride, 32, &dc); +} + +void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32xh(dst, stride, 32, &dc_dup); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above); + int i; + for (i = 0; i < 2; ++i) { + _mm_storel_epi64((__m128i *)dst, above_u16); + _mm_storel_epi64((__m128i *)(dst + stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); +} + +void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above_u16 = _mm_load_si128((const __m128i *)above); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above_u16); + _mm_store_si128((__m128i *)(dst + stride), above_u16); + _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16); + _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16); + dst += stride << 2; + } +} + +void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + dst += stride; + } +} + +void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + const __m128i above0_u16 = _mm_load_si128((const __m128i *)above); + const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24)); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + _mm_store_si128((__m128i *)dst, above0_u16); + _mm_store_si128((__m128i *)(dst + 8), above1_u16); + _mm_store_si128((__m128i *)(dst + 16), above2_u16); + _mm_store_si128((__m128i *)(dst + 24), above3_u16); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_above = dc_sum_4(above); + const __m128i sum_left = dc_sum_8(left); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + const __m128i sum_left = dc_sum_4(left); + const __m128i sum_above = dc_sum_8(above); + const __m128i sum = _mm_add_epi16(sum_above, sum_left); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 >>= 16; + sum32 += 6; + sum32 /= 12; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); +} + +void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 12; + sum32 /= 24; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 2; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + const __m128i zero = _mm_setzero_si128(); + sum_above = _mm_unpacklo_epi16(sum_above, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + dst += stride; + } +} + +void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)bd; + __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32(above); + const __m128i zero = _mm_setzero_si128(); + sum_left = _mm_unpacklo_epi16(sum_left, zero); + const __m128i sum = _mm_add_epi32(sum_left, sum_above); + uint32_t sum32 = _mm_cvtsi128_si32(sum); + sum32 += 24; + sum32 /= 48; + const __m128i row = _mm_set1_epi16((uint16_t)sum32); + int i; + for (i = 0; i < 4; ++i) { + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + _mm_store_si128((__m128i *)dst, row); + _mm_store_si128((__m128i *)(dst + 8), row); + _mm_store_si128((__m128i *)(dst + 16), row); + _mm_store_si128((__m128i *)(dst + 24), row); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); +} + +void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i h76543210 = _mm_load_si128((const __m128i *)above); + __m128i hx7654321 = _mm_srli_si128(h76543210, 2); + __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7); + __m128i hx8765432 = _mm_srli_si128(h87654321, 2); + __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7); + __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432); + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8)); + dst += stride; + + // hcba98765 + h76543210 = _mm_loadu_si128((const __m128i *)((above + 5))); + h76543210 = _mm_insert_epi16(h76543210, above[11], 7); + // hxcba9876 + hx7654321 = _mm_srli_si128(h76543210, 2); + // hxxcba987 + hx8765432 = _mm_srli_si128(h76543210, 4); + avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432); + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); +} + +void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + __m128i y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x0 = _mm_loadu_si128((const __m128i *)(above + 3)); + y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x1 = _mm_loadu_si128((const __m128i *)(above + 4)); + y = avg3_epu16(&x2, &x0, &x1); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x2 = _mm_loadu_si128((const __m128i *)(above + 5)); + x2 = _mm_insert_epi16(x2, above[11], 7); + y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); +} + +static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1, + const __m128i *a2, uint16_t **dst, + ptrdiff_t stride) { + const __m128i y = avg3_epu16(a0, a1, a2); + _mm_storeu_si128((__m128i *)*dst, y); + *dst += stride; +} + +void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + + d45e_w8(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x1, &x2, &x0, &dst, stride); + + x1 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x2, &x0, &x1, &dst, stride); + + x2 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x0, &x1, &x2, &dst, stride); + } while (i < 9); + + x0 = _mm_loadu_si128((const __m128i *)(above + 9)); + x0 = _mm_insert_epi16(x0, above[15], 7); + const __m128i y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); +} + +void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + (void)left; + (void)bd; + __m128i x0 = _mm_load_si128((const __m128i *)above); + __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); + __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); + + d45e_w8(&x0, &x1, &x2, &dst, stride); + + int i = 3; + do { + x0 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x1, &x2, &x0, &dst, stride); + + x1 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x2, &x0, &x1, &dst, stride); + + x2 = _mm_loadu_si128((const __m128i *)(above + i++)); + d45e_w8(&x0, &x1, &x2, &dst, stride); + } while (i < 15); + + x0 = _mm_loadu_si128((const __m128i *)(above + 15)); + __m128i y = avg3_epu16(&x1, &x2, &x0); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x1 = _mm_loadu_si128((const __m128i *)(above + 16)); + y = avg3_epu16(&x2, &x0, &x1); + _mm_store_si128((__m128i *)dst, y); + dst += stride; + + x2 = _mm_loadu_si128((const __m128i *)(above + 17)); + x2 = _mm_insert_epi16(x2, above[23], 7); + y = avg3_epu16(&x0, &x1, &x2); + _mm_store_si128((__m128i *)dst, y); +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c new file mode 100644 index 000000000..b089a3f43 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c @@ -0,0 +1,521 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 +}; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + } + } +} + +void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c new file mode 100644 index 000000000..94c68885c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -0,0 +1,873 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/x86/common_avx2.h" +#include "aom_dsp/x86/lpf_common_sse2.h" +#include "aom/aom_integer.h" + +#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m256i *blt, + __m256i *lt, __m256i *thr) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *blt = _mm256_slli_epi16(y, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *lt = _mm256_slli_epi16(y, shift); + + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + *thr = _mm256_slli_epi16(y, shift); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m256i *p, __m256i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch)); + q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch)); + } +} + +static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q, + const __m256i *t, __m256i *hev) { + const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0])); + const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0])); + __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0); + h = _mm256_subs_epu16(h, *t); + + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + const __m256i zero = _mm256_setzero_si256(); + *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff); +} + +static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q, + const __m256i *l, const __m256i *bl, + __m256i *mask) { + __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0])); + __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1])); + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + + const __m256i zero = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff); + max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm256_max_epi16(max, + _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1]))); + max = _mm256_max_epi16(max, + _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1]))); + } + max = _mm256_subs_epu16(max, *l); + *mask = _mm256_cmpeq_epi16(max, zero); // return ~mask +} + +static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p, + const __m256i *q, int bd, int start, + int end, __m256i *flat) { + __m256i max = _mm256_setzero_si256(); + int i; + for (i = start; i < end; ++i) { + max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0]))); + max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0]))); + } + + __m256i ft; + if (bd == 8) + ft = _mm256_subs_epu16(max, *th); + else if (bd == 10) + ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2)); + else // bd == 12 + ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4)); + + const __m256i zero = _mm256_setzero_si256(); + *flat = _mm256_cmpeq_epi16(ft, zero); +} + +// Note: +// Access p[3-1], p[0], and q[3-1], q[0] +static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p, + const __m256i *q, __m256i *flat, int bd) { + // check the distance 1,2,3 against 0 + flat_mask_internal(th, p, q, bd, 1, 4, flat); +} + +// Note: +// access p[7-4], p[0], and q[7-4], q[0] +static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p, + const __m256i *q, __m256i *flat, int bd) { + flat_mask_internal(th, p, q, bd, 4, 8, flat); +} + +static INLINE void pixel_clamp(const __m256i *min, const __m256i *max, + __m256i *pixel) { + __m256i clamped, mask; + + mask = _mm256_cmpgt_epi16(*pixel, *max); + clamped = _mm256_andnot_si256(mask, *pixel); + mask = _mm256_and_si256(mask, *max); + clamped = _mm256_or_si256(mask, clamped); + + mask = _mm256_cmpgt_epi16(clamped, *min); + clamped = _mm256_and_si256(mask, clamped); + mask = _mm256_andnot_si256(mask, *min); + *pixel = _mm256_or_si256(clamped, mask); +} + +static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask, + const __m256i *th, int bd, __m256i *ps, + __m256i *qs) { + __m256i t80; + if (bd == 8) + t80 = _mm256_set1_epi16(0x80); + else if (bd == 10) + t80 = _mm256_set1_epi16(0x200); + else // bd == 12 + t80 = _mm256_set1_epi16(0x800); + + __m256i ps0 = _mm256_subs_epi16(p[0], t80); + __m256i ps1 = _mm256_subs_epi16(p[1], t80); + __m256i qs0 = _mm256_subs_epi16(q[0], t80); + __m256i qs1 = _mm256_subs_epi16(q[1], t80); + + const __m256i one = _mm256_set1_epi16(1); + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i zero = _mm256_setzero_si256(); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filter = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); + + __m256i hev; + highbd_hev_mask(p, q, th, &hev); + filter = _mm256_and_si256(filter, hev); + + const __m256i x = _mm256_subs_epi16(qs0, ps0); + filter = _mm256_adds_epi16(filter, x); + filter = _mm256_adds_epi16(filter, x); + filter = _mm256_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm256_and_si256(filter, *mask); + + const __m256i t3 = _mm256_set1_epi16(3); + const __m256i t4 = _mm256_set1_epi16(4); + + __m256i filter1 = _mm256_adds_epi16(filter, t4); + __m256i filter2 = _mm256_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm256_srai_epi16(filter1, 3); + filter2 = _mm256_srai_epi16(filter2, 3); + + qs0 = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); + + qs[0] = _mm256_adds_epi16(qs0, t80); + ps[0] = _mm256_adds_epi16(ps0, t80); + + filter = _mm256_adds_epi16(filter1, one); + filter = _mm256_srai_epi16(filter, 1); + filter = _mm256_andnot_si256(hev, filter); + + qs1 = _mm256_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm256_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); + + qs[1] = _mm256_adds_epi16(qs1, t80); + ps[1] = _mm256_adds_epi16(ps1, t80); +} +#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 + +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd); +} + +void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, + const uint8_t *blt, const uint8_t *lt, + const uint8_t *thr, int bd) { + aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); +} +#else +void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + __m256i blimit, limit, thresh; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); + + __m256i p[8], q[8]; + load_highbd_pixel(s, 8, pitch, p, q); + + __m256i mask; + highbd_filter_mask(p, q, &limit, &blimit, &mask); + + __m256i flat, flat2; + const __m256i one = _mm256_set1_epi16(1); + highbd_flat_mask4(&one, p, q, &flat, bd); + highbd_flat_mask5(&one, p, q, &flat2, bd); + + flat = _mm256_and_si256(flat, mask); + flat2 = _mm256_and_si256(flat2, flat); + + __m256i ps[2], qs[2]; + highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); + + // flat and wide flat calculations + __m256i flat_p[3], flat_q[3]; + __m256i flat2_p[7], flat2_q[7]; + { + const __m256i eight = _mm256_set1_epi16(8); + const __m256i four = _mm256_set1_epi16(4); + + __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]), + _mm256_add_epi16(p[4], p[3])); + __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]), + _mm256_add_epi16(q[4], q[3])); + + __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1])); + sum_p = _mm256_add_epi16(sum_p, sum_lp); + + __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1])); + sum_q = _mm256_add_epi16(sum_q, sum_lq); + sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q)); + sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq)); + + flat2_p[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4); + flat2_q[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4); + flat_p[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3); + flat_q[0] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3); + + __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]); + __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]); + __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]); + __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]); + + sum_q = _mm256_sub_epi16(sum_p, p[6]); + sum_p = _mm256_sub_epi16(sum_p, q[6]); + flat2_p[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4); + flat2_q[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4); + + sum_lq = _mm256_sub_epi16(sum_lp, p[2]); + sum_lp = _mm256_sub_epi16(sum_lp, q[2]); + flat_p[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3); + + sum_p7 = _mm256_add_epi16(sum_p7, p[7]); + sum_q7 = _mm256_add_epi16(sum_q7, q[7]); + sum_p3 = _mm256_add_epi16(sum_p3, p[3]); + sum_q3 = _mm256_add_epi16(sum_q3, q[3]); + + sum_p = _mm256_sub_epi16(sum_p, q[5]); + sum_q = _mm256_sub_epi16(sum_q, p[5]); + flat2_p[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4); + flat2_q[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4); + + sum_lp = _mm256_sub_epi16(sum_lp, q[1]); + sum_lq = _mm256_sub_epi16(sum_lq, p[1]); + flat_p[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = _mm256_srli_epi16( + _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3); + + int i; + for (i = 3; i < 7; ++i) { + sum_p7 = _mm256_add_epi16(sum_p7, p[7]); + sum_q7 = _mm256_add_epi16(sum_q7, q[7]); + sum_p = _mm256_sub_epi16(sum_p, q[7 - i]); + sum_q = _mm256_sub_epi16(sum_q, p[7 - i]); + flat2_p[i] = _mm256_srli_epi16( + _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4); + flat2_q[i] = _mm256_srli_epi16( + _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4); + } + } + + // highbd_filter8 + p[2] = _mm256_andnot_si256(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm256_and_si256(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm256_or_si256(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm256_andnot_si256(flat, q[2]); + flat_q[2] = _mm256_and_si256(flat, flat_q[2]); + q[2] = _mm256_or_si256(q[2], flat_q[2]); // full list of q2 values + + int i; + for (i = 1; i >= 0; i--) { + ps[i] = _mm256_andnot_si256(flat, ps[i]); + flat_p[i] = _mm256_and_si256(flat, flat_p[i]); + p[i] = _mm256_or_si256(ps[i], flat_p[i]); + qs[i] = _mm256_andnot_si256(flat, qs[i]); + flat_q[i] = _mm256_and_si256(flat, flat_q[i]); + q[i] = _mm256_or_si256(qs[i], flat_q[i]); + } + + // highbd_filter16 + + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm256_andnot_si256(flat2, p[i]); + flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm256_or_si256(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm256_andnot_si256(flat2, q[i]); + flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]); + q[i] = _mm256_or_si256(q[i], flat2_q[i]); + _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]); + _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]); + } +} + +static INLINE void highbd_transpose16x16(uint16_t *src, int src_p, + uint16_t *dst, int dst_p) { + __m256i x[16]; + int i; + for (i = 0; i < 16; ++i) { + x[i] = _mm256_loadu_si256((const __m256i *)src); + src += src_p; + } + mm256_transpose_16x16(x, x); + for (i = 0; i < 16; ++i) { + _mm256_storeu_si256((__m256i *)dst, x[i]); + dst += dst_p; + } +} + +void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[256]); + + // Transpose 16x16 + highbd_transpose16x16(s - 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit, + thresh, bd); + + // Transpose back + highbd_transpose16x16(t_dst, 16, s - 8, p); +} + +static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0, + const uint8_t *t0, const uint8_t *b1, + const uint8_t *l1, const uint8_t *t1, int bd, + __m256i *blt, __m256i *lt, __m256i *thr) { + const __m128i z128 = _mm_setzero_si128(); + const __m128i blimit0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128); + const __m128i limit0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128); + const __m128i thresh0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128); + const __m128i blimit1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128); + const __m128i limit1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128); + const __m128i thresh1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128); + + *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1); + *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1); + *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1); + + int shift = bd - 8; + *blt = _mm256_slli_epi16(*blt, shift); + *lt = _mm256_slli_epi16(*lt, shift); + *thr = _mm256_slli_epi16(*thr, shift); +} + +void aom_highbd_lpf_horizontal_4_dual_avx2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); + __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); + __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); + __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p)); + __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); + __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); + + const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); + const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); + + __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); + __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); + + __m256i blimit, limit, thresh; + get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit, &limit, &thresh); + + __m256i t80, tff80, tffe0, t1f, t7f; + if (bd == 8) { + t80 = _mm256_set1_epi16(0x80); + tff80 = _mm256_set1_epi16(0xff80); + tffe0 = _mm256_set1_epi16(0xffe0); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2); + tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2); + tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6); + } else { // bd == 12 + t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4); + tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4); + tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4); + t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4); + t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4); + } + + __m256i ps1 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80); + __m256i ps0 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80); + __m256i qs0 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80); + __m256i qs1 = + _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80); + + // filter_mask and hev_mask + const __m256i zero = _mm256_setzero_si256(); + __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); + __m256i hev = _mm256_subs_epu16(flat, thresh); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + __m256i mask = + _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + const __m256i one = _mm256_set1_epi16(1); + mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); + mask = _mm256_max_epi16(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + __m256i work = _mm256_max_epi16( + _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)), + _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3))); + mask = _mm256_max_epi16(work, mask); + work = _mm256_max_epi16( + _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)), + _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3))); + mask = _mm256_max_epi16(work, mask); + mask = _mm256_subs_epu16(mask, limit); + mask = _mm256_cmpeq_epi16(mask, zero); + + // filter4 + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filt = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, hev); + __m256i work_a = _mm256_subs_epi16(qs0, ps0); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + pixel_clamp(&pmin, &pmax, &filt); + + // (aom_filter + 3 * (qs0 - ps0)) & mask + filt = _mm256_and_si256(filt, mask); + + const __m256i t4 = _mm256_set1_epi16(4); + const __m256i t3 = _mm256_set1_epi16(3); + + __m256i filter1 = _mm256_adds_epi16(filt, t4); + pixel_clamp(&pmin, &pmax, &filter1); + __m256i filter2 = _mm256_adds_epi16(filt, t3); + pixel_clamp(&pmin, &pmax, &filter2); + + // Filter1 >> 3 + work_a = _mm256_cmpgt_epi16(zero, filter1); // get the values that are <0 + filter1 = _mm256_srli_epi16(filter1, 3); + work_a = _mm256_and_si256(work_a, tffe0); // sign bits for the values < 0 + filter1 = _mm256_and_si256(filter1, t1f); // clamp the range + filter1 = _mm256_or_si256(filter1, work_a); // reinsert the sign bits + + // Filter2 >> 3 + work_a = _mm256_cmpgt_epi16(zero, filter2); + filter2 = _mm256_srli_epi16(filter2, 3); + work_a = _mm256_and_si256(work_a, tffe0); + filter2 = _mm256_and_si256(filter2, t1f); + filter2 = _mm256_or_si256(filter2, work_a); + + // filt >> 1 + // equivalent to shifting 0x1f left by bitdepth - 8 + // and setting new bits to 1 + filt = _mm256_adds_epi16(filter1, one); + work_a = _mm256_cmpgt_epi16(zero, filt); + filt = _mm256_srli_epi16(filt, 1); + work_a = _mm256_and_si256(work_a, tff80); + filt = _mm256_and_si256(filt, t7f); + filt = _mm256_or_si256(filt, work_a); + + filt = _mm256_andnot_si256(hev, filt); + + filter1 = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &filter1); + q0 = _mm256_adds_epi16(filter1, t80); + + filter1 = _mm256_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &filter1); + q1 = _mm256_adds_epi16(filter1, t80); + + filter2 = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &filter2); + p0 = _mm256_adds_epi16(filter2, t80); + + filter2 = _mm256_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &filter2); + p1 = _mm256_adds_epi16(filter2, t80); + + _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); + _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); + _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); + _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); +} + +void aom_highbd_lpf_horizontal_8_dual_avx2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); + DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); + + __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); + __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); + __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); + __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); + __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); + __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p)); + + __m256i blimit, limit, thresh; + get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit, &limit, &thresh); + + __m256i t80; + if (bd == 8) { + t80 = _mm256_set1_epi16(0x80); + } else if (bd == 10) { + t80 = _mm256_set1_epi16(0x200); + } else { // bd == 12 + t80 = _mm256_set1_epi16(0x800); + } + + __m256i ps1, ps0, qs0, qs1; + ps1 = _mm256_subs_epi16(p1, t80); + ps0 = _mm256_subs_epi16(p0, t80); + qs0 = _mm256_subs_epi16(q0, t80); + qs1 = _mm256_subs_epi16(q1, t80); + + // filter_mask and hev_mask + __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); + abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); + + abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); + abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); + __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); + __m256i hev = _mm256_subs_epu16(flat, thresh); + const __m256i zero = _mm256_set1_epi16(0); + const __m256i ffff = _mm256_set1_epi16(0xFFFF); + hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); + + abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); + __m256i mask = + _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + + const __m256i one = _mm256_set1_epi16(1); + mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); + mask = _mm256_max_epi16(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + mask = _mm256_max_epi16(abs_q1q0, mask); + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)), + _mm256_abs_epi16(_mm256_sub_epi16(q2, q1))); + mask = _mm256_max_epi16(work, mask); + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)), + _mm256_abs_epi16(_mm256_sub_epi16(q3, q2))); + mask = _mm256_max_epi16(work, mask); + mask = _mm256_subs_epu16(mask, limit); + mask = _mm256_cmpeq_epi16(mask, zero); + + // flat_mask4 + flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)), + _mm256_abs_epi16(_mm256_sub_epi16(q2, q0))); + work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)), + _mm256_abs_epi16(_mm256_sub_epi16(q3, q0))); + flat = _mm256_max_epi16(work, flat); + flat = _mm256_max_epi16(abs_p1p0, flat); + flat = _mm256_max_epi16(abs_q1q0, flat); + + if (bd == 8) + flat = _mm256_subs_epu16(flat, one); + else if (bd == 10) + flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2)); + else // bd == 12 + flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4)); + + flat = _mm256_cmpeq_epi16(flat, zero); + flat = _mm256_and_si256(flat, mask); // flat & mask + + // Added before shift for rounding part of ROUND_POWER_OF_TWO + __m256i workp_a, workp_b, workp_shft; + workp_a = + _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1)); + const __m256i four = _mm256_set1_epi16(4); + workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0); + workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft); + + workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft); + + workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3); + workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2); + workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); + _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft); + + // lp filter + const __m256i pmax = _mm256_subs_epi16( + _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); + const __m256i pmin = _mm256_subs_epi16(zero, t80); + + __m256i filt, filter1, filter2, work_a; + filt = _mm256_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, hev); + work_a = _mm256_subs_epi16(qs0, ps0); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + filt = _mm256_adds_epi16(filt, work_a); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm256_and_si256(filt, mask); + + const __m256i t4 = _mm256_set1_epi16(4); + const __m256i t3 = _mm256_set1_epi16(3); + + filter1 = _mm256_adds_epi16(filt, t4); + filter2 = _mm256_adds_epi16(filt, t3); + + // Filter1 >> 3 + pixel_clamp(&pmin, &pmax, &filter1); + filter1 = _mm256_srai_epi16(filter1, 3); + + // Filter2 >> 3 + pixel_clamp(&pmin, &pmax, &filter2); + filter2 = _mm256_srai_epi16(filter2, 3); + + // filt >> 1 + filt = _mm256_adds_epi16(filter1, one); + filt = _mm256_srai_epi16(filt, 1); + // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; + filt = _mm256_andnot_si256(hev, filt); + + work_a = _mm256_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + q0 = _mm256_loadu_si256((__m256i *)flat_oq0); + work_a = _mm256_andnot_si256(flat, work_a); + q0 = _mm256_and_si256(flat, q0); + q0 = _mm256_or_si256(work_a, q0); + + work_a = _mm256_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + q1 = _mm256_loadu_si256((__m256i *)flat_oq1); + work_a = _mm256_andnot_si256(flat, work_a); + q1 = _mm256_and_si256(flat, q1); + q1 = _mm256_or_si256(work_a, q1); + + work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p)); + q2 = _mm256_loadu_si256((__m256i *)flat_oq2); + work_a = _mm256_andnot_si256(flat, work_a); + q2 = _mm256_and_si256(flat, q2); + q2 = _mm256_or_si256(work_a, q2); + + work_a = _mm256_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + p0 = _mm256_loadu_si256((__m256i *)flat_op0); + work_a = _mm256_andnot_si256(flat, work_a); + p0 = _mm256_and_si256(flat, p0); + p0 = _mm256_or_si256(work_a, p0); + + work_a = _mm256_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &work_a); + work_a = _mm256_adds_epi16(work_a, t80); + p1 = _mm256_loadu_si256((__m256i *)flat_op1); + work_a = _mm256_andnot_si256(flat, work_a); + p1 = _mm256_and_si256(flat, p1); + p1 = _mm256_or_si256(work_a, p1); + + work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p)); + p2 = _mm256_loadu_si256((__m256i *)flat_op2); + work_a = _mm256_andnot_si256(flat, work_a); + p2 = _mm256_and_si256(flat, p2); + p2 = _mm256_or_si256(work_a, p2); + + _mm256_storeu_si256((__m256i *)(s - 3 * p), p2); + _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); + _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); + _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); + _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); + _mm256_storeu_si256((__m256i *)(s + 2 * p), q2); +} + +void aom_highbd_lpf_vertical_4_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} + +void aom_highbd_lpf_vertical_8_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); + uint16_t *src[2]; + uint16_t *dst[2]; + + // Transpose 8x16 + highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + + // Loop filtering + aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + src[0] = t_dst; + src[1] = t_dst + 8; + + dst[0] = s - 4; + dst[1] = s - 4 + p * 8; + + // Transpose back + highbd_transpose(src, 16, dst, p, 2); +} +#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 76369871b..0a399edf2 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -12,135 +12,135 @@ #include // SSE2 #include "./aom_dsp_rtcd.h" -#include "aom_ports/mem.h" +#include "aom_dsp/x86/lpf_common_sse2.h" #include "aom_ports/emmintrin_compat.h" +#include "aom_ports/mem.h" -static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { - __m128i ubounded; - __m128i lbounded; - __m128i retval; +static INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + __m128i clamped, mask; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - __m128i t80, max, min; + mask = _mm_cmpgt_epi16(*pixel, *max); + clamped = _mm_andnot_si128(mask, *pixel); + mask = _mm_and_si128(mask, *max); + clamped = _mm_or_si128(mask, clamped); - if (bd == 8) { - t80 = _mm_set1_epi16(0x80); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80); - } else if (bd == 10) { - t80 = _mm_set1_epi16(0x200); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80); - } else { // bd == 12 - t80 = _mm_set1_epi16(0x800); - max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80); - } + mask = _mm_cmpgt_epi16(clamped, *min); + clamped = _mm_and_si128(mask, clamped); + mask = _mm_andnot_si128(mask, *min); + *pixel = _mm_or_si128(clamped, mask); +} - min = _mm_subs_epi16(zero, t80); +static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, + const uint8_t *t, int bd, __m128i *blt, + __m128i *lt, __m128i *thr) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); - ubounded = _mm_cmpgt_epi16(value, max); - lbounded = _mm_cmplt_epi16(value, min); - retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value); - ubounded = _mm_and_si128(ubounded, max); - lbounded = _mm_and_si128(lbounded, min); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_or_si128(retval, lbounded); - return retval; -} + __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); + *blt = _mm_slli_epi16(x, shift); -// TODO(debargha, peter): Break up large functions into smaller ones -// in this file. -void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; - __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; - __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; - __m128i ps1, qs1, ps0, qs0; - __m128i abs_p0q0, abs_p1q1, ffff, work; - __m128i filt, work_a, filter1, filter2; - __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4; - __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1; - __m128i flat2_q0, flat2_p0; - __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3; - __m128i t4, t3, t80, t1; - __m128i eight, four; + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); + *lt = _mm_slli_epi16(x, shift); - if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); - } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); - } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); + *thr = _mm_slli_epi16(x, shift); +} + +static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, + __m128i *p, __m128i *q) { + int i; + for (i = 0; i < size; i++) { + p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); } +} +// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); +static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q, + const __m128i *t, __m128i *hev) { + const __m128i abs_p1p0 = + _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1])); + const __m128i abs_q1q0 = + _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1])); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *t); - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - - // highbd_filter_mask - abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); - abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i zero = _mm_setzero_si128(); + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); +} - ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); +static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = + _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0])); + __m128i abs_p1q1 = + _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1])); + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + int i; + for (i = 1; i < 4; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]), + _mm_subs_epu16(p[i - 1], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]), + _mm_subs_epu16(q[i - 1], q[i]))); + } + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // return ~mask +} - // highbd_hev_mask (in C code this is actually called from highbd_filter4) - flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p, + const __m128i *q, int bd, int start, + int end, __m128i *flat) { + __m128i max = _mm_setzero_si128(); + int i; + for (i = start; i < end; ++i) { + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]), + _mm_subs_epu16(p[0], p[i]))); + max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]), + _mm_subs_epu16(q[0], q[i]))); + } - abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), - _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); - mask = _mm_max_epi16(work, mask); + __m128i ft; + if (bd == 8) + ft = _mm_subs_epu16(max, *th); + else if (bd == 10) + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2)); + else // bd == 12 + ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4)); - mask = _mm_subs_epu16(mask, limit); - mask = _mm_cmpeq_epi16(mask, zero); // return ~mask + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} - // lp filter - // highbd_filter4 - t4 = _mm_set1_epi16(4); - t3 = _mm_set1_epi16(3); +// Note: +// Access p[3-1], p[0], and q[3-1], q[0] +static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + // check the distance 1,2,3 against 0 + flat_mask_internal(th, p, q, bd, 1, 4, flat); +} + +// Note: +// access p[7-4], p[0], and q[7-4], q[0] +static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p, + const __m128i *q, __m128i *flat, int bd) { + flat_mask_internal(th, p, q, bd, 4, 8, flat); +} + +static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask, + const __m128i *th, int bd, __m128i *ps, + __m128i *qs) { + __m128i t80; if (bd == 8) t80 = _mm_set1_epi16(0x80); else if (bd == 10) @@ -148,340 +148,283 @@ void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, else // bd == 12 t80 = _mm_set1_epi16(0x800); - t1 = _mm_set1_epi16(0x1); + __m128i ps0 = _mm_subs_epi16(p[0], t80); + __m128i ps1 = _mm_subs_epi16(p[1], t80); + __m128i qs0 = _mm_subs_epi16(q[0], t80); + __m128i qs1 = _mm_subs_epi16(q[1], t80); - ps1 = _mm_subs_epi16(p1, t80); - qs1 = _mm_subs_epi16(q1, t80); - ps0 = _mm_subs_epi16(p0, t80); - qs0 = _mm_subs_epi16(q0, t80); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, t80); - filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd), - hev); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); - filt = _mm_and_si128(filt, mask); - filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); - filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + __m128i filter = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filter); - // Filter1 >> 3 - filter1 = _mm_srai_epi16(filter1, 0x3); - filter2 = _mm_srai_epi16(filter2, 0x3); + __m128i hev; + highbd_hev_mask(p, q, th, &hev); + filter = _mm_and_si128(filter, hev); - qs0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - ps0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(hev, filt); - qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); + const __m128i x = _mm_subs_epi16(qs0, ps0); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + filter = _mm_adds_epi16(filter, x); + pixel_clamp(&pmin, &pmax, &filter); + filter = _mm_and_si128(filter, *mask); - // end highbd_filter4 - // loopfilter done + const __m128i t3 = _mm_set1_epi16(3); + const __m128i t4 = _mm_set1_epi16(4); - // highbd_flat_mask4 - flat = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3))); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); - flat = _mm_max_epi16(work, flat); - work = _mm_max_epi16(abs_p1p0, abs_q1q0); - flat = _mm_max_epi16(work, flat); + __m128i filter1 = _mm_adds_epi16(filter, t4); + __m128i filter2 = _mm_adds_epi16(filter, t3); + pixel_clamp(&pmin, &pmax, &filter1); + pixel_clamp(&pmin, &pmax, &filter2); + filter1 = _mm_srai_epi16(filter1, 3); + filter2 = _mm_srai_epi16(filter2, 3); - if (bd == 8) - flat = _mm_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + qs0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &qs0); + ps0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &ps0); - flat = _mm_cmpeq_epi16(flat, zero); - // end flat_mask4 + qs[0] = _mm_adds_epi16(qs0, t80); + ps[0] = _mm_adds_epi16(ps0, t80); - // flat & mask = flat && mask (as used in filter8) - // (because, in both vars, each block of 16 either all 1s or all 0s) - flat = _mm_and_si128(flat, mask); + filter = _mm_adds_epi16(filter1, one); + filter = _mm_srai_epi16(filter, 1); + filter = _mm_andnot_si128(hev, filter); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + qs1 = _mm_subs_epi16(qs1, filter); + pixel_clamp(&pmin, &pmax, &qs1); + ps1 = _mm_adds_epi16(ps1, filter); + pixel_clamp(&pmin, &pmax, &ps1); - // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 - // but referred to as p0-p4 & q0-q4 in fn) - flat2 = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)), - _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4))); + qs[1] = _mm_adds_epi16(qs1, t80); + ps[1] = _mm_adds_epi16(ps1, t80); +} - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)), - _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5))); - flat2 = _mm_max_epi16(work, flat2); +typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput; - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)), - _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6))); - flat2 = _mm_max_epi16(work, flat2); +static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd, + PixelOutput pixel_output) { + __m128i blimit, limit, thresh; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)), - _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7))); - flat2 = _mm_max_epi16(work, flat2); + __m128i p[8], q[8]; + load_highbd_pixel(s, 8, pitch, p, q); - if (bd == 8) - flat2 = _mm_subs_epu16(flat2, one); - else if (bd == 10) - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4)); + __m128i mask; + highbd_filter_mask(p, q, &limit, &blimit, &mask); + + __m128i flat, flat2; + const __m128i one = _mm_set1_epi16(1); + highbd_flat_mask4(&one, p, q, &flat, bd); + highbd_flat_mask5(&one, p, q, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); - flat2 = _mm_cmpeq_epi16(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - // end highbd_flat_mask5 + __m128i ps[2], qs[2]; + highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations - eight = _mm_set1_epi16(8); - four = _mm_set1_epi16(4); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - flat2_p0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4); - flat2_q0 = - _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4); - flat_p0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3); - flat_q0 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3); - - sum_p7 = _mm_add_epi16(p7, p7); - sum_q7 = _mm_add_epi16(q7, q7); - sum_p3 = _mm_add_epi16(p3, p3); - sum_q3 = _mm_add_epi16(q3, q3); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6); - flat2_p1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4); - flat2_q1 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2); - flat_p1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3); - flat_q1 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - sum_p3 = _mm_add_epi16(sum_p3, p3); - sum_q3 = _mm_add_epi16(sum_q3, q3); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5); - flat2_p2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4); - flat2_q2 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1); - flat_p2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3); - flat_q2 = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4); - flat2_p3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4); - flat2_q3 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3); - flat2_p4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4); - flat2_q4 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2); - flat2_p5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4); - flat2_q5 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4); - - sum_p7 = _mm_add_epi16(sum_p7, p7); - sum_q7 = _mm_add_epi16(sum_q7, q7); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1); - flat2_p6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4); - flat2_q6 = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4); - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // highbd_filter8 - p2 = _mm_andnot_si128(flat, p2); + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[7], flat2_q[7]; + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + + __m128i sum_p = + _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3])); + __m128i sum_q = + _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3])); + + __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); + sum_q = _mm_add_epi16(sum_q, sum_lq); + sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat2_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4); + flat2_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4); + flat_p[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); + flat_q[0] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); + + __m128i sum_p7 = _mm_add_epi16(p[7], p[7]); + __m128i sum_q7 = _mm_add_epi16(q[7], q[7]); + __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); + __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); + + sum_q = _mm_sub_epi16(sum_p, p[6]); + sum_p = _mm_sub_epi16(sum_p, q[6]); + flat2_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4); + flat2_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + flat_p[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); + flat_q[1] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); + + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); + + sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_q, p[5]); + flat2_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4); + flat2_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4); + + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + flat_p[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); + flat_q[2] = + _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); + + int i; + for (i = 3; i < 7; ++i) { + sum_p7 = _mm_add_epi16(sum_p7, p[7]); + sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p = _mm_sub_epi16(sum_p, q[7 - i]); + sum_q = _mm_sub_epi16(sum_q, p[7 - i]); + flat2_p[i] = + _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4); + flat2_q[i] = + _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4); + } + } + + // highbd_filter8 + p[2] = _mm_andnot_si128(flat, p[2]); // p2 remains unchanged if !(flat && mask) - flat_p2 = _mm_and_si128(flat, flat_p2); + flat_p[2] = _mm_and_si128(flat, flat_p[2]); // when (flat && mask) - p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values - - ps1 = _mm_andnot_si128(flat, ps1); - // p1 takes the value assigned to in in filter4 if !(flat && mask) - flat_p1 = _mm_and_si128(flat, flat_p1); - // when (flat && mask) - p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values - qs1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values - - ps0 = _mm_andnot_si128(flat, ps0); - // p0 takes the value assigned to in in filter4 if !(flat && mask) - flat_p0 = _mm_and_si128(flat, flat_p0); - // when (flat && mask) - p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values - qs0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values - // end highbd_filter8 + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + int i; + for (i = 1; i >= 0; i--) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } // highbd_filter16 - p6 = _mm_andnot_si128(flat2, p6); - // p6 remains unchanged if !(flat2 && flat && mask) - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - // get values for when (flat2 && flat && mask) - p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values - q6 = _mm_andnot_si128(flat2, q6); - // q6 remains unchanged if !(flat2 && flat && mask) - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - // get values for when (flat2 && flat && mask) - q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); - - p5 = _mm_andnot_si128(flat2, p5); - // p5 remains unchanged if !(flat2 && flat && mask) - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - // get values for when (flat2 && flat && mask) - p5 = _mm_or_si128(p5, flat2_p5); - // full list of p5 values - q5 = _mm_andnot_si128(flat2, q5); - // q5 remains unchanged if !(flat2 && flat && mask) - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - // get values for when (flat2 && flat && mask) - q5 = _mm_or_si128(q5, flat2_q5); - // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); - - p4 = _mm_andnot_si128(flat2, p4); - // p4 remains unchanged if !(flat2 && flat && mask) - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - // get values for when (flat2 && flat && mask) - p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values - q4 = _mm_andnot_si128(flat2, q4); - // q4 remains unchanged if !(flat2 && flat && mask) - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - // get values for when (flat2 && flat && mask) - q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); - - p3 = _mm_andnot_si128(flat2, p3); - // p3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - // get values for when (flat2 && flat && mask) - p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values - q3 = _mm_andnot_si128(flat2, q3); - // q3 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - // get values for when (flat2 && flat && mask) - q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); - - p2 = _mm_andnot_si128(flat2, p2); - // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - // get values for when (flat2 && flat && mask) - p2 = _mm_or_si128(p2, flat2_p2); - // full list of p2 values - q2 = _mm_andnot_si128(flat2, q2); - // q2 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - // get values for when (flat2 && flat && mask) - q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); - - p1 = _mm_andnot_si128(flat2, p1); - // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - // get values for when (flat2 && flat && mask) - p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values - q1 = _mm_andnot_si128(flat2, q1); - // q1 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - // get values for when (flat2 && flat && mask) - q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - - p0 = _mm_andnot_si128(flat2, p0); - // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - // get values for when (flat2 && flat && mask) - p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values - q0 = _mm_andnot_si128(flat2, q0); - // q0 takes value from highbd_filter8 if !(flat2 && flat && mask) - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - // get values for when (flat2 && flat && mask) - q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); + + if (pixel_output == FOUR_PIXELS) { + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]); + } + } else { // EIGHT_PIXELS + for (i = 6; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + } + } +} + +// Note: +// highbd_lpf_horz_edge_8_8p() output 8 pixels per register +// highbd_lpf_horz_edge_8_4p() output 4 pixels per register +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 +static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS); +} +#endif // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + +static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch, + const uint8_t *blt, + const uint8_t *lt, + const uint8_t *thr, int bd) { + highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS); +} + +void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); +#endif } void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh, int bd) { - aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd); - aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); +#else + highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); + highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd); +#endif +} + +static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1, + const __m128i *p0, const __m128i *q0, + const __m128i *q1, const __m128i *q2, + int p, uint16_t *s) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 3 * p), *p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), *p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), *p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), *q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), *q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), *q2); +#else + _mm_store_si128((__m128i *)(s - 3 * p), *p2); + _mm_store_si128((__m128i *)(s - 2 * p), *p1); + _mm_store_si128((__m128i *)(s - 1 * p), *p0); + _mm_store_si128((__m128i *)(s + 0 * p), *q0); + _mm_store_si128((__m128i *)(s + 1 * p), *q1); + _mm_store_si128((__m128i *)(s + 2 * p), *q2); +#endif } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, @@ -497,14 +440,14 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -635,41 +578,48 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); // lp filter - filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i pmin = _mm_subs_epi16(zero, t80); + + filt = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, hev); work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = signed_char_clamp_bd_sse2(filt, bd); + pixel_clamp(&pmin, &pmax, &filt); filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi16(filt, t4); filter2 = _mm_adds_epi16(filt, t3); // Filter1 >> 3 - filter1 = signed_char_clamp_bd_sse2(filter1, bd); + pixel_clamp(&pmin, &pmax, &filter1); filter1 = _mm_srai_epi16(filter1, 3); // Filter2 >> 3 - filter2 = signed_char_clamp_bd_sse2(filter2, bd); + pixel_clamp(&pmin, &pmax, &filter2); filter2 = _mm_srai_epi16(filter2, 3); // filt >> 1 filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); - // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; filt = _mm_andnot_si128(hev, filt); - work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd); + work_a = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); q0 = _mm_load_si128((__m128i *)flat_oq0); work_a = _mm_andnot_si128(flat, work_a); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); - work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd); + work_a = _mm_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); q1 = _mm_load_si128((__m128i *)flat_oq1); work_a = _mm_andnot_si128(flat, work_a); @@ -682,14 +632,16 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); - work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd); + work_a = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); p0 = _mm_load_si128((__m128i *)flat_op0); work_a = _mm_andnot_si128(flat, work_a); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); - work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd); + work_a = _mm_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &work_a); work_a = _mm_adds_epi16(work_a, t80); p1 = _mm_load_si128((__m128i *)flat_op1); work_a = _mm_andnot_si128(flat, work_a); @@ -702,12 +654,7 @@ void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s); } void aom_highbd_lpf_horizontal_8_dual_sse2( @@ -725,14 +672,18 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const __m128i zero = _mm_set1_epi16(0); __m128i blimit, limit, thresh; __m128i mask, hev, flat; +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); +#endif __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); +#endif const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -743,7 +694,7 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); - __m128i work; + const __m128i t4 = _mm_set1_epi16(4); const __m128i t3 = _mm_set1_epi16(3); __m128i t80; @@ -814,9 +765,9 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, // So taking maximums continues to work: mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); mask = _mm_max_epi16(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epi16( + +#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) + __m128i work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); mask = _mm_max_epi16(work, mask); @@ -824,22 +775,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); +#endif mask = _mm_subs_epu16(mask, limit); mask = _mm_cmpeq_epi16(mask, zero); // filter4 - filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); + const __m128i pmin = _mm_subs_epi16(zero, t80); + + filt = _mm_subs_epi16(ps1, qs1); + pixel_clamp(&pmin, &pmax, &filt); filt = _mm_and_si128(filt, hev); work_a = _mm_subs_epi16(qs0, ps0); filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); - filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + filt = _mm_adds_epi16(filt, work_a); + pixel_clamp(&pmin, &pmax, &filt); // (aom_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); - filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd); - filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd); + filter1 = _mm_adds_epi16(filt, t4); + pixel_clamp(&pmin, &pmax, &filter1); + + filter2 = _mm_adds_epi16(filt, t3); + pixel_clamp(&pmin, &pmax, &filter2); // Filter1 >> 3 work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 @@ -865,19 +826,32 @@ void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, filt = _mm_andnot_si128(hev, filt); - q0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80); - q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), - t80); - p0 = _mm_adds_epi16( - signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80); - p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), - t80); - + q0 = _mm_subs_epi16(qs0, filter1); + pixel_clamp(&pmin, &pmax, &q0); + q0 = _mm_adds_epi16(q0, t80); + + q1 = _mm_subs_epi16(qs1, filt); + pixel_clamp(&pmin, &pmax, &q1); + q1 = _mm_adds_epi16(q1, t80); + + p0 = _mm_adds_epi16(ps0, filter2); + pixel_clamp(&pmin, &pmax, &p0); + p0 = _mm_adds_epi16(p0, t80); + + p1 = _mm_adds_epi16(ps1, filt); + pixel_clamp(&pmin, &pmax, &p1); + p1 = _mm_adds_epi16(p1, t80); +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); +#else _mm_storeu_si128((__m128i *)(s - 2 * p), p1); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); _mm_storeu_si128((__m128i *)(s + 0 * p), q0); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +#endif } void aom_highbd_lpf_horizontal_4_dual_sse2( @@ -888,118 +862,6 @@ void aom_highbd_lpf_horizontal_4_dual_sse2( aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); } -static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], - int out_p, int num_8x8_to_transpose) { - int idx8x8 = 0; - __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; - do { - uint16_t *in = src[idx8x8]; - uint16_t *out = dst[idx8x8]; - - p0 = - _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 - p1 = - _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 - p2 = - _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 - p3 = - _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 - p4 = - _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 - p5 = - _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 - p6 = - _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 - p7 = - _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 - // 00 10 01 11 02 12 03 13 - x0 = _mm_unpacklo_epi16(p0, p1); - // 20 30 21 31 22 32 23 33 - x1 = _mm_unpacklo_epi16(p2, p3); - // 40 50 41 51 42 52 43 53 - x2 = _mm_unpacklo_epi16(p4, p5); - // 60 70 61 71 62 72 63 73 - x3 = _mm_unpacklo_epi16(p6, p7); - // 00 10 20 30 01 11 21 31 - x4 = _mm_unpacklo_epi32(x0, x1); - // 40 50 60 70 41 51 61 71 - x5 = _mm_unpacklo_epi32(x2, x3); - // 00 10 20 30 40 50 60 70 - x6 = _mm_unpacklo_epi64(x4, x5); - // 01 11 21 31 41 51 61 71 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); - // 00 10 20 30 40 50 60 70 - _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); - // 01 11 21 31 41 51 61 71 - - // 02 12 22 32 03 13 23 33 - x4 = _mm_unpackhi_epi32(x0, x1); - // 42 52 62 72 43 53 63 73 - x5 = _mm_unpackhi_epi32(x2, x3); - // 02 12 22 32 42 52 62 72 - x6 = _mm_unpacklo_epi64(x4, x5); - // 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); - // 02 12 22 32 42 52 62 72 - _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); - // 03 13 23 33 43 53 63 73 - - // 04 14 05 15 06 16 07 17 - x0 = _mm_unpackhi_epi16(p0, p1); - // 24 34 25 35 26 36 27 37 - x1 = _mm_unpackhi_epi16(p2, p3); - // 44 54 45 55 46 56 47 57 - x2 = _mm_unpackhi_epi16(p4, p5); - // 64 74 65 75 66 76 67 77 - x3 = _mm_unpackhi_epi16(p6, p7); - // 04 14 24 34 05 15 25 35 - x4 = _mm_unpacklo_epi32(x0, x1); - // 44 54 64 74 45 55 65 75 - x5 = _mm_unpacklo_epi32(x2, x3); - // 04 14 24 34 44 54 64 74 - x6 = _mm_unpacklo_epi64(x4, x5); - // 05 15 25 35 45 55 65 75 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); - // 04 14 24 34 44 54 64 74 - _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); - // 05 15 25 35 45 55 65 75 - - // 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi32(x0, x1); - // 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi32(x2, x3); - // 06 16 26 36 46 56 66 76 - x6 = _mm_unpacklo_epi64(x4, x5); - // 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); - // 06 16 26 36 46 56 66 76 - _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); - // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} - -static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, - uint16_t *out, int out_p) { - uint16_t *src0[1]; - uint16_t *src1[1]; - uint16_t *dest0[1]; - uint16_t *dest1[1]; - src0[0] = in0; - src1[0] = in1; - dest0[0] = out; - dest1[0] = out + 8; - highbd_transpose(src0, in_p, dest0, out_p, 1); - highbd_transpose(src1, in_p, dest1, out_p, 1); -} - void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { @@ -1130,10 +992,12 @@ void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - // Loop filtering +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); +#else aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); - +#endif // Transpose back highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm index 9c3bbdd69..855bc6558 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -293,4 +293,6 @@ HIGH_SADNXN4D 4, 16 HIGH_SADNXN4D 16, 4 HIGH_SADNXN4D 8, 32 HIGH_SADNXN4D 32, 8 +HIGH_SADNXN4D 16, 64 +HIGH_SADNXN4D 64, 16 %endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm index 248b98ef5..760e68aab 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -158,7 +158,10 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 - +%if CONFIG_EXT_PARTITION_TYPES +HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 +HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 +%endif ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -302,6 +305,8 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 %if CONFIG_EXT_PARTITION_TYPES HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 +HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 +HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 %endif ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c index 7bc8a0df3..befd81269 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -177,177 +177,94 @@ static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride, _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7); } -static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 3; - src += src_stride << 3; - pred += pred_stride << 3; - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 8; - src += 8; - pred += 8; - subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 3; - src += src_stride << 3; - pred += pred_stride << 3; - subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 4; - src += src_stride << 4; - pred += pred_stride << 4; - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 16; - src += 16; - pred += 16; - subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 4; - src += src_stride << 4; - pred += pred_stride << 4; - subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 5; - src += src_stride << 5; - pred += pred_stride << 5; - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 32; - src += 32; - pred += 32; - subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 5; - src += src_stride << 5; - pred += pred_stride << 5; - subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 6; - src += src_stride << 6; - pred += pred_stride << 6; - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += 64; - src += 64; - pred += 64; - subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} - -static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride, - const uint16_t *src, ptrdiff_t src_stride, - const uint16_t *pred, ptrdiff_t pred_stride) { - subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); - diff += diff_stride << 6; - src += src_stride << 6; - pred += pred_stride << 6; - subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride); -} +#define STACK_V(h, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \ + pred + pred_stride * h, pred_stride); \ + } while (0) + +#define STACK_H(w, fun) \ + do { \ + fun(diff, diff_stride, src, src_stride, pred, pred_stride); \ + fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \ + } while (0) + +#define SUBTRACT_FUN(size) \ + static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \ + const uint16_t *src, ptrdiff_t src_stride, \ + const uint16_t *pred, ptrdiff_t pred_stride) + +SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); } +SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); } +SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); } +SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); } +SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } +SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } +SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } +SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } +SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } +#if CONFIG_EXT_PARTITION +SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } +SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } +SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } +#endif +SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } +SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } +SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } +SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } +SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } +SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } +#if CONFIG_EXT_PARTITION +SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); } +SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); } +#endif static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { - SubtractWxHFuncType ret_func_ptr = NULL; if (rows == 4) { - if (cols == 4) { - ret_func_ptr = subtract_4x4; - } else if (cols == 8) { - ret_func_ptr = subtract_8x4; - } - } else if (rows == 8) { - if (cols == 4) { - ret_func_ptr = subtract_4x8; - } else if (cols == 8) { - ret_func_ptr = subtract_8x8; - } else if (cols == 16) { - ret_func_ptr = subtract_16x8; - } - } else if (rows == 16) { - if (cols == 8) { - ret_func_ptr = subtract_8x16; - } else if (cols == 16) { - ret_func_ptr = subtract_16x16; - } else if (cols == 32) { - ret_func_ptr = subtract_32x16; - } - } else if (rows == 32) { - if (cols == 16) { - ret_func_ptr = subtract_16x32; - } else if (cols == 32) { - ret_func_ptr = subtract_32x32; - } else if (cols == 64) { - ret_func_ptr = subtract_64x32; - } - } else if (rows == 64) { - if (cols == 32) { - ret_func_ptr = subtract_32x64; - } else if (cols == 64) { - ret_func_ptr = subtract_64x64; - } else if (cols == 128) { - ret_func_ptr = subtract_128x64; - } - } else if (rows == 128) { - if (cols == 64) { - ret_func_ptr = subtract_64x128; - } else if (cols == 128) { - ret_func_ptr = subtract_128x128; - } + if (cols == 4) return subtract_4x4; + if (cols == 8) return subtract_8x4; + if (cols == 16) return subtract_16x4; + } + if (rows == 8) { + if (cols == 4) return subtract_4x8; + if (cols == 8) return subtract_8x8; + if (cols == 16) return subtract_16x8; + if (cols == 32) return subtract_32x8; + } + if (rows == 16) { + if (cols == 4) return subtract_4x16; + if (cols == 8) return subtract_8x16; + if (cols == 16) return subtract_16x16; + if (cols == 32) return subtract_32x16; + if (cols == 64) return subtract_64x16; + } + if (rows == 32) { + if (cols == 8) return subtract_8x32; + if (cols == 16) return subtract_16x32; + if (cols == 32) return subtract_32x32; + if (cols == 64) return subtract_64x32; +#if CONFIG_EXT_PARTITION + if (cols == 128) return subtract_128x32; +#endif // CONFIG_EXT_PARTITION + } + if (rows == 64) { + if (cols == 16) return subtract_16x64; + if (cols == 32) return subtract_32x64; + if (cols == 64) return subtract_64x64; +#if CONFIG_EXT_PARTITION + if (cols == 128) return subtract_128x64; +#endif // CONFIG_EXT_PARTITION } - if (!ret_func_ptr) { - assert(0); +#if CONFIG_EXT_PARTITION + if (rows == 128) { + if (cols == 32) return subtract_32x128; + if (cols == 64) return subtract_64x128; + if (cols == 128) return subtract_128x128; } - return ret_func_ptr; +#endif // CONFIG_EXT_PARTITION + assert(0); + return NULL; } void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff, diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 93923ffb0..62acf3ed3 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -189,6 +189,8 @@ VAR_FN(8, 8, 8, 6); VAR_FN(16, 4, 16, 6); VAR_FN(8, 32, 8, 8); VAR_FN(32, 8, 16, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); #endif #undef VAR_FN @@ -411,7 +413,9 @@ DECLS(sse2); FN(8, 4, 8, 3, 2, opt, (int64_t)); \ FN(16, 4, 16, 4, 2, opt, (int64_t)); \ FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)) + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)) #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ @@ -588,7 +592,9 @@ DECLS(sse2); FN(8, 4, 8, 3, 2, opt, (int64_t)); \ FN(16, 4, 16, 4, 2, opt, (int64_t)); \ FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t)); #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c new file mode 100644 index 000000000..6b8922b8c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +static INLINE __m256i dc_sum_32(const uint8_t *ref) { + const __m256i x = _mm256_loadu_si256((const __m256i *)ref); + const __m256i zero = _mm256_setzero_si256(); + __m256i y = _mm256_sad_epu8(x, zero); + __m256i u = _mm256_permute2x128_si256(y, y, 1); + y = _mm256_add_epi64(u, y); + u = _mm256_unpackhi_epi64(y, y); + return _mm256_add_epi16(y, u); +} + +static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + dst += stride; + } +} + +void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum_left = _mm256_add_epi16(sum_left, thirtytwo); + sum_left = _mm256_srai_epi16(sum_left, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum_left, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 32, dst, stride); +} + +// There are 32 rows togeter. This function does line: +// 0,1,2,3, and 16,17,18,19. The next call would do +// 4,5,6,7, and 20,21,22,23. So 4 times of calling +// would finish 32 rows. +static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst, + ptrdiff_t stride) { + __m256i t[4]; + __m256i m = _mm256_setzero_si256(); + const __m256i inc = _mm256_set1_epi8(4); + int i; + + for (i = 0; i < 4; i++) { + t[i] = _mm256_shuffle_epi8(*row, m); + __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); + __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); + _mm256_storeu_si256((__m256i *)dst, r0); + _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); + dst += stride; + m = _mm256_add_epi8(m, inc); + } +} + +void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); + + __m256i u = _mm256_unpacklo_epi8(left_col, left_col); + + __m256i v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + u = _mm256_unpackhi_epi8(left_col, left_col); + + v = _mm256_unpacklo_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); + dst += stride << 2; + + v = _mm256_unpackhi_epi8(u, u); + h_predictor_32x8line(&v, dst, stride); +} + +// ----------------------------------------------------------------------------- +// Rectangle + +// TODO(luoyi) The following two functions are shared with intrapred_sse2.c. +// Use a header file, intrapred_common_x86.h +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i top_sum = dc_sum_32_sse2(above); + __m128i left_sum = dc_sum_16_sse2(left); + left_sum = _mm_add_epi16(top_sum, left_sum); + uint32_t sum = _mm_cvtsi128_si32(left_sum); + sum += 24; + sum /= 48; + + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 16, dst, stride); +} + +void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// TM_PRED + +// Return 16 16-bit pixels in one row (__m256i) +static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i base = + _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); + + __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); + __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); + __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); + + __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); + mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); + __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); + + pl = _mm256_andnot_si256(mask1, *left); + + ptl = _mm256_and_si256(mask2, *topleft); + pt = _mm256_andnot_si256(mask2, *top); + pt = _mm256_or_si256(pt, ptl); + pt = _mm256_and_si256(mask1, pt); + + return _mm256_or_si256(pt, pl); +} + +// Return 16 8-bit pixels in one row (__m128i) +static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, + const __m256i *topleft) { + const __m256i p0 = paeth_pred(left, top, topleft); + const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i p = _mm256_packus_epi16(p0, p1); + return _mm256_castsi256_si128(p); +} + +static INLINE __m256i get_top_vector(const uint8_t *above) { + const __m128i x = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t0 = _mm_unpacklo_epi8(x, zero); + const __m128i t1 = _mm_unpackhi_epi8(x, zero); + return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); +} + +void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i x = _mm_loadl_epi64((const __m128i *)left); + const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 8; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +static INLINE __m256i get_left_vector(const uint8_t *left) { + const __m128i x = _mm_load_si128((const __m128i *)left); + return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); +} + +void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +// Return 32 8-bit pixels in one row (__m256i) +static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, + const __m256i *top1, + const __m256i *topleft) { + __m256i p0 = paeth_pred(left, top0, topleft); + __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x0 = _mm256_packus_epi16(p0, p1); + + p0 = paeth_pred(left, top1, topleft); + p1 = _mm256_permute4x64_epi64(p0, 0xe); + const __m256i x1 = _mm256_packus_epi16(p0, p1); + + return _mm256_permute2x128_si256(x0, x1, 0x20); +} + +void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); + + _mm256_storeu_si256((__m256i *)dst, r); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m256i l = get_left_vector(left); + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + __m256i rep = _mm256_set1_epi16(0x8000); + const __m256i one = _mm256_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + + l = get_left_vector(left + 16); + rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm index 02567db49..9aece27be 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm @@ -623,149 +623,3 @@ cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left lea dstq, [dstq+strideq*4] jnz .loop REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_4x4, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movq m0, [aboveq-1]; [63:0] tl t1 t2 t3 t4 x x x - punpcklbw m0, m1 - pshuflw m2, m0, 0x0 ; [63:0] tl tl tl tl [word] - psrldq m0, 2 - psubw m0, m2 ; [63:0] t1-tl t2-tl t3-tl t4-tl [word] - movd m2, [leftq] - punpcklbw m2, m1 - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - pshuflw m4, m2, 0xaa - pshuflw m3, m2, 0xff - paddw m4, m0 - paddw m3, m0 - packuswb m4, m4 - packuswb m3, m3 - movd [dstq ], m4 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 5, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movq m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 ; t1 t2 t3 t4 t5 t6 t7 t8 [word] - pshuflw m2, m2, 0x0 ; [63:0] tl tl tl tl [word] - DEFINE_ARGS dst, stride, line, left - mov lineq, -4 - punpcklqdq m2, m2 ; tl tl tl tl tl tl tl tl [word] - psubw m0, m2 ; t1-tl t2-tl ... t8-tl [word] - movq m2, [leftq] - punpcklbw m2, m1 ; l1 l2 l3 l4 l5 l6 l7 l8 [word] -.loop: - pshuflw m4, m2, 0x0 ; [63:0] l1 l1 l1 l1 [word] - pshuflw m3, m2, 0x55 ; [63:0] l2 l2 l2 l2 [word] - punpcklqdq m4, m4 ; l1 l1 l1 l1 l1 l1 l1 l1 [word] - punpcklqdq m3, m3 ; l2 l2 l2 l2 l2 l2 l2 l2 [word] - paddw m4, m0 - paddw m3, m0 - packuswb m4, m3 - movq [dstq ], m4 - movhps [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m2, 4 - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_16x16, 4, 5, 8, dst, stride, above, left - pxor m1, m1 - mova m2, [aboveq-16]; - mova m0, [aboveq] ; t1 t2 ... t16 [byte] - punpckhbw m2, m1 ; [127:112] tl [word] - punpckhbw m4, m0, m1 - punpcklbw m0, m1 ; m0:m4 t1 t2 ... t16 [word] - DEFINE_ARGS dst, stride, line, left, stride8 - mov lineq, -8 - pshufhw m2, m2, 0xff - mova m3, [leftq] ; l1 l2 ... l16 [byte] - punpckhqdq m2, m2 ; tl repeated 8 times [word] - psubw m0, m2 - psubw m4, m2 ; m0:m4 t1-tl t2-tl ... t16-tl [word] - punpckhbw m5, m3, m1 - punpcklbw m3, m1 ; m3:m5 l1 l2 ... l16 [word] - lea stride8q, [strideq*8] -.loop: - pshuflw m6, m3, 0x0 - pshuflw m7, m5, 0x0 - punpcklqdq m6, m6 ; l1 repeated 8 times [word] - punpcklqdq m7, m7 ; l8 repeated 8 times [word] - paddw m1, m6, m0 - paddw m6, m4 ; m1:m6 ti-tl+l1 [i=1,15] [word] - psrldq m5, 2 - packuswb m1, m6 - mova [dstq ], m1 - paddw m1, m7, m0 - paddw m7, m4 ; m1:m7 ti-tl+l8 [i=1,15] [word] - psrldq m3, 2 - packuswb m1, m7 - mova [dstq+stride8q], m1 - inc lineq - lea dstq, [dstq+strideq] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 8, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - mova m4, [aboveq+16] - punpcklbw m2, m1 - punpckhbw m3, m0, m1 - punpckhbw m5, m4, m1 - punpcklbw m0, m1 - punpcklbw m4, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - punpcklqdq m2, m2 - add leftq, 32 - psubw m0, m2 - psubw m3, m2 - psubw m4, m2 - psubw m5, m2 -.loop: - movd m2, [leftq+lineq*2] - pxor m1, m1 - punpcklbw m2, m1 - pshuflw m7, m2, 0x55 - pshuflw m2, m2, 0x0 - punpcklqdq m2, m2 - punpcklqdq m7, m7 - paddw m6, m2, m3 - paddw m1, m2, m0 - packuswb m1, m6 - mova [dstq ], m1 - paddw m6, m2, m5 - paddw m1, m2, m4 - packuswb m1, m6 - mova [dstq+16 ], m1 - paddw m6, m7, m3 - paddw m1, m7, m0 - packuswb m1, m6 - mova [dstq+strideq ], m1 - paddw m6, m7, m5 - paddw m1, m7, m4 - packuswb m1, m6 - mova [dstq+strideq+16], m1 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c new file mode 100644 index 000000000..2a83b9001 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c @@ -0,0 +1,684 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" + +static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) { + int i; + for (i = 0; i < 4; ++i) { + *(uint32_t *)dst = dc; + dst += stride; + *(uint32_t *)dst = dc; + dst += stride; + } +} + +static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_storel_epi64((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + dst += stride; + } +} + +static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + dst += stride; + } +} + +static INLINE __m128i dc_sum_4(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_unpacklo_epi8(x, zero); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_8(const uint8_t *ref) { + __m128i x = _mm_loadl_epi64((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + return _mm_sad_epu8(x, zero); +} + +static INLINE __m128i dc_sum_16(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +// ----------------------------------------------------------------------------- +// DC_PRED + +void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum /= 12; + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 6; + sum /= 12; + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum /= 24; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_8(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 12; + sum /= 24; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum /= 48; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 24; + sum /= 48; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_TOP + +void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_LEFT + +void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4x8(pred, dst, stride); +} + +void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// V_PRED + +void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4x8(pred, dst, stride); +} + +void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 4, dst, stride); +} + +void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 16, dst, stride); +} + +void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 8, dst, stride); +} + +void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 32, dst, stride); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + (void)left; + int i; + for (i = 0; i < 16; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +// ----------------------------------------------------------------------------- +// H_PRED + +void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); + dst += stride; + left_col = _mm_unpackhi_epi64(left_col, left_col); + row0 = _mm_shufflelo_epi16(left_col, 0); + row1 = _mm_shufflelo_epi16(left_col, 0x55); + row2 = _mm_shufflelo_epi16(left_col, 0xaa); + row3 = _mm_shufflelo_epi16(left_col, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + dst += stride; + } +} + +static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflelo_epi16(*x, 0); + const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff); + + row[0] = _mm_unpacklo_epi64(u0, u0); + row[1] = _mm_unpacklo_epi64(u1, u1); + row[2] = _mm_unpacklo_epi64(u2, u2); + row[3] = _mm_unpacklo_epi64(u3, u3); +} + +static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) { + const __m128i u0 = _mm_shufflehi_epi16(*x, 0); + const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55); + const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa); + const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff); + + row[0] = _mm_unpackhi_epi64(u0, u0); + row[1] = _mm_unpackhi_epi64(u1, u1); + row[2] = _mm_unpackhi_epi64(u2, u2); + row[3] = _mm_unpackhi_epi64(u3, u3); +} + +// Process 16x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +// Process 16x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_16xh(row, 4, dst, stride); +} + +void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + int i = 0; + + do { + left_col = _mm_load_si128((const __m128i *)left); + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_16x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left += 16; + i++; + } while (i < 2); +} + +static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, + ptrdiff_t stride) { + int i; + for (i = 0; i < h; ++i) { + _mm_store_si128((__m128i *)dst, row[i]); + _mm_store_si128((__m128i *)(dst + 16), row[i]); + dst += stride; + } +} + +// Process 32x8, first 4 rows +// Use first 8 bytes of left register: xxxxxxxx33221100 +static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_low_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +// Process 32x8, second 4 rows +// Use second 8 bytes of left register: 77665544xxxxxxxx +static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, + ptrdiff_t stride) { + __m128i row[4]; + repeat_high_4pixels(left, row); + h_pred_store_32xh(row, 4, dst, stride); +} + +void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); + dst += stride << 2; + + left_col_8p = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c new file mode 100644 index 000000000..85b82744e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -0,0 +1,885 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "./aom_dsp_rtcd.h" +#include "aom_dsp/intrapred_common.h" + +// ----------------------------------------------------------------------------- +// TM_PRED + +// Return 8 16-bit pixels in one row +static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, + const __m128i *topleft) { + const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft); + + __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left)); + __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top)); + __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft)); + + __m128i mask1 = _mm_cmpgt_epi16(pl, pt); + mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl)); + __m128i mask2 = _mm_cmpgt_epi16(pt, ptl); + + pl = _mm_andnot_si128(mask1, *left); + + ptl = _mm_and_si128(mask2, *topleft); + pt = _mm_andnot_si128(mask2, *top); + pt = _mm_or_si128(pt, ptl); + pt = _mm_and_si128(mask1, pt); + + return _mm_or_si128(pl, pt); +} + +void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// Return 16 8-bit pixels in one row +static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, + const __m128i *top1, + const __m128i *topleft) { + const __m128i p0 = paeth_8x1_pred(left, top0, topleft); + const __m128i p1 = paeth_8x1_pred(left, top1, topleft); + return _mm_packus_epi16(p0, p1); +} + +void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_loadl_epi64((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 8; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + int i; + for (i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + l = _mm_load_si128((const __m128i *)(left + 16)); + rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + __m128i l = _mm_load_si128((const __m128i *)left); + __m128i l16; + + int i; + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + + rep = _mm_set1_epi16(0x8000); + l = _mm_load_si128((const __m128i *)(left + 16)); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + +// ----------------------------------------------------------------------------- +// SMOOTH_PRED + +// pixels[0]: above and below_pred interleave vector +// pixels[1]: left vector +// pixels[2]: right_pred vector +static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_loadl_epi64((const __m128i *)above); + pixels[2] = _mm_set1_epi16((uint16_t)above[3]); + pixels[1] = _mm_loadl_epi64((const __m128i *)left); + + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weights[0]: weights_h vector +// weights[1]: scale - weights_h vecotr +// weights[2]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); + const __m128i zero = _mm_setzero_si128(); + + weights[0] = _mm_unpacklo_epi8(t, zero); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weights[1] = _mm_sub_epi16(d, weights[0]); + weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]); + + if (height == 8) { + t = _mm_srli_si128(t, 4); + weights[0] = _mm_unpacklo_epi8(t, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } +} + +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight, + int h, uint8_t *dst, ptrdiff_t stride) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s = _mm_madd_epi16(pixel[0], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixel[1], rep); + b = _mm_unpacklo_epi16(b, pixel[2]); + __m128i sum = _mm_madd_epi16(b, weight[2]); + + sum = _mm_add_epi32(s, sum); + sum = _mm_add_epi32(sum, round); + sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale); + + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 4, pixels); + + __m128i weights[3]; + load_weight_w4(sm_weight_arrays, 4, weights); + + smooth_pred_4xh(pixels, weights, 4, dst, stride); +} + +void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 8, pixels); + + __m128i weights[3]; + load_weight_w4(sm_weight_arrays, 8, weights); + + smooth_pred_4xh(pixels, weights, 8, dst, stride); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +// pixels[2]: left vector +// pixels[3]: right_pred vector +static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i d = _mm_loadl_epi64((const __m128i *)above); + pixels[3] = _mm_set1_epi16((uint16_t)above[7]); + pixels[2] = _mm_load_si128((const __m128i *)left); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_w8(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + const int we_offset = height < 8 ? 4 : 8; + __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + + if (height == 4) { + we = _mm_srli_si128(we, 4); + __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); + } else { + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + } + + if (height == 16) { + we = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(we, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(we, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } +} + +static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + + __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i d = _mm_set1_epi16(0x100); + + int i; + for (i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + + __m128i b = _mm_shuffle_epi8(pixels[2], rep); + b = _mm_unpacklo_epi16(b, pixels[3]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); + dst += stride; + + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 4, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 4, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0); +} + +void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 8, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 8, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_w8(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w8(sm_weight_arrays, 16, wh, ww); + + smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); +} + +// pixels[0]: above and below_pred interleave vector, 1/4 +// pixels[1]: above and below_pred interleave vector, 2/4 +// pixels[2]: above and below_pred interleave vector, 3/4 +// pixels[3]: above and below_pred interleave vector, 3/4 +// pixels[4]: left vector +// pixels[5]: left vector, h = 32 only +// pixels[6]: right_pred vector +static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i ab = _mm_load_si128((const __m128i *)above); + pixels[6] = _mm_set1_epi16((uint16_t)above[15]); + pixels[4] = _mm_load_si128((const __m128i *)left); + pixels[5] = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(ab, zero); + pixels[0] = _mm_unpacklo_epi16(x, bp); + pixels[1] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab, zero); + pixels[2] = _mm_unpacklo_epi16(x, bp); + pixels[3] = _mm_unpackhi_epi16(x, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// ... ... +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +// ... ... +static INLINE void load_weight_w16(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]); + __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); + __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); + __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 8) { + weight_h[0] = _mm_unpacklo_epi8(w8, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); // scale - weight_h + + __m128i x = _mm_unpacklo_epi8(w16, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + x = _mm_unpackhi_epi8(w16, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + } + + if (height == 16) { + weight_h[0] = _mm_unpacklo_epi8(w16, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w16, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); + weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); + } + + if (height == 32) { + weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + __m128i x = _mm_unpacklo_epi8(w16, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + x = _mm_unpackhi_epi8(w16, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + + weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, uint8_t *dst, + ptrdiff_t stride, int quarter) { + __m128i d = _mm_set1_epi16(0x100); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + __m128i rep = + (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); + const __m128i left = (quarter < 2) ? pixels[4] : pixels[5]; + + int i; + for (i = 0; i < 8; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); + __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); + __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc); + __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc); + + __m128i b = _mm_shuffle_epi8(left, rep); + b = _mm_unpacklo_epi16(b, pixels[6]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); + __m128i sum2 = _mm_madd_epi16(b, ww[2]); + __m128i sum3 = _mm_madd_epi16(b, ww[3]); + + s0 = _mm_add_epi32(s0, sum0); + s0 = _mm_add_epi32(s0, round); + s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); + + s1 = _mm_add_epi32(s1, sum1); + s1 = _mm_add_epi32(s1, round); + s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); + + s2 = _mm_add_epi32(s2, sum2); + s2 = _mm_add_epi32(s2, round); + s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale); + + s3 = _mm_add_epi32(s3, sum3); + s3 = _mm_add_epi32(s3, round); + s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale); + + sum0 = _mm_packus_epi16(s0, s1); + sum0 = _mm_shuffle_epi8(sum0, gat); + sum1 = _mm_packus_epi16(s2, s3); + sum1 = _mm_shuffle_epi8(sum1, gat); + + _mm_storel_epi64((__m128i *)dst, sum0); + _mm_storel_epi64((__m128i *)(dst + 8), sum1); + + dst += stride; + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 8, pixels); + + __m128i wh[2], ww[4]; + load_weight_w16(sm_weight_arrays, 8, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 16, pixels); + + __m128i wh[4], ww[4]; + load_weight_w16(sm_weight_arrays, 16, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[7]; + load_pixel_w16(above, left, 32, pixels); + + __m128i wh[8], ww[4]; + load_weight_w16(sm_weight_arrays, 32, wh, ww); + + smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2); + dst += stride << 3; + smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3); +} + +static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + __m128i ab0 = _mm_load_si128((const __m128i *)above); + __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16)); + + pixels[10] = _mm_set1_epi16((uint16_t)above[31]); + pixels[8] = _mm_load_si128((const __m128i *)left); + pixels[9] = _mm_load_si128((const __m128i *)(left + 16)); + + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + const __m128i zero = _mm_setzero_si128(); + + __m128i x = _mm_unpacklo_epi8(ab0, zero); + pixels[0] = _mm_unpacklo_epi16(x, bp); + pixels[1] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab0, zero); + pixels[2] = _mm_unpacklo_epi16(x, bp); + pixels[3] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpacklo_epi8(ab1, zero); + pixels[4] = _mm_unpacklo_epi16(x, bp); + pixels[5] = _mm_unpackhi_epi16(x, bp); + + x = _mm_unpackhi_epi8(ab1, zero); + pixels[6] = _mm_unpacklo_epi16(x, bp); + pixels[7] = _mm_unpackhi_epi16(x, bp); +} + +static INLINE void load_weight_w32(const uint8_t *weight_array, int height, + __m128i *weight_h, __m128i *weight_w) { + const __m128i zero = _mm_setzero_si128(); + __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); + __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); + __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 16) { + weight_h[0] = _mm_unpacklo_epi8(w16, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w16, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + __m128i x = _mm_unpacklo_epi8(w32_0, zero); + __m128i y = _mm_sub_epi16(d, x); + weight_w[0] = _mm_unpacklo_epi16(x, y); + weight_w[1] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpackhi_epi8(w32_0, zero); + y = _mm_sub_epi16(d, x); + weight_w[2] = _mm_unpacklo_epi16(x, y); + weight_w[3] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpacklo_epi8(w32_1, zero); + y = _mm_sub_epi16(d, x); + weight_w[4] = _mm_unpacklo_epi16(x, y); + weight_w[5] = _mm_unpackhi_epi16(x, y); + + x = _mm_unpackhi_epi8(w32_1, zero); + y = _mm_sub_epi16(d, x); + weight_w[6] = _mm_unpacklo_epi16(x, y); + weight_w[7] = _mm_unpackhi_epi16(x, y); + } + + if (height == 32) { + weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + + weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); + weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); + weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); + weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); + + weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]); + weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]); + weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]); + weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]); + } +} + +static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh, + const __m128i *ww, uint8_t *dst, + ptrdiff_t stride, int quarter) { + __m128i d = _mm_set1_epi16(0x100); + const __m128i one = _mm_set1_epi16(1); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); + __m128i rep = + (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); + const __m128i left = (quarter < 2) ? pixels[8] : pixels[9]; + + int i; + for (i = 0; i < 8; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + + int j; + __m128i s[8]; + __m128i b = _mm_shuffle_epi8(left, rep); + b = _mm_unpacklo_epi16(b, pixels[10]); + + for (j = 0; j < 8; ++j) { + s[j] = _mm_madd_epi16(pixels[j], wh_sc); + s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j])); + s[j] = _mm_add_epi32(s[j], round); + s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale); + } + + for (j = 0; j < 8; j += 2) { + __m128i sum = _mm_packus_epi16(s[j], s[j + 1]); + sum = _mm_shuffle_epi8(sum, gat); + _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum); + } + dst += stride; + rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[11]; + load_pixel_w32(above, left, 16, pixels); + + __m128i wh[4], ww[8]; + load_weight_w32(sm_weight_arrays, 16, wh, ww); + + smooth_pred_32x8(pixels, wh, ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[11]; + load_pixel_w32(above, left, 32, pixels); + + __m128i wh[8], ww[8]; + load_weight_w32(sm_weight_arrays, 32, wh, ww); + + smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2); + dst += stride << 3; + smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3); +} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h index 4238e651b..26c5cfe59 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h @@ -18,17 +18,17 @@ #include "aom_dsp/x86/txfm_common_avx2.h" static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { -#if CONFIG_HIGHBITDEPTH - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); -#else - *in = _mm256_loadu_si256((const __m256i *)coeff); -#endif + if (sizeof(tran_low_t) == 4) { + *in = _mm256_setr_epi16( + (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], + (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], + (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], + (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], + (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], + (int16_t)coeff[15]); + } else { + *in = _mm256_loadu_si256((const __m256i *)coeff); + } } static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h index 95d246c3c..342816977 100644 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h @@ -133,12 +133,12 @@ static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { // Function to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled static INLINE __m128i load_input_data(const tran_low_t *data) { -#if CONFIG_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); -#else - return _mm_load_si128((const __m128i *)data); -#endif + if (sizeof(tran_low_t) == 4) { + return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], + data[6], data[7]); + } else { + return _mm_load_si128((const __m128i *)data); + } } static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index 7e134dc63..8343dbbed 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -178,10 +178,20 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, #endif // !CONFIG_PARALLEL_DEBLOCKING FILTER4; +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0); + ps1ps0 = _mm_srli_si128(ps1ps0, 8); + *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0); + + *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0); + qs1qs0 = _mm_srli_si128(qs1qs0, 8); + *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0); +#else _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 +#endif } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, @@ -267,8 +277,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); +#if !CONFIG_PARALLEL_DEBLOCKING // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); +#endif // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); @@ -279,7 +291,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); ps1ps0 = _mm_srli_si128(ps1ps0, 4); *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - +#if !CONFIG_PARALLEL_DEBLOCKING *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); qs1qs0 = _mm_srli_si128(qs1qs0, 4); *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); @@ -287,6 +299,19 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); qs1qs0 = _mm_srli_si128(qs1qs0, 4); *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); +#endif +} + +static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num, + uint8_t *s) { +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x); + const __m128i hi = _mm_srli_si128(*x, 8); + *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi); +#else + _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x); + _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x)); +#endif } void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, @@ -580,44 +605,37 @@ void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + store_buffer_horz_8(&q6p6, p, 6, s); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + store_buffer_horz_8(&q5p5, p, 5, s); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + store_buffer_horz_8(&q4p4, p, 4, s); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + store_buffer_horz_8(&q3p3, p, 3, s); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + store_buffer_horz_8(&q2p2, p, 2, s); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + store_buffer_horz_8(&q1p1, p, 1, s); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + store_buffer_horz_8(&q0p0, p, 0, s); } } @@ -651,10 +669,33 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput; + +static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x, + int p, int offset, uint8_t *s) { + int i; + if (pixel_num == FOUR_PIXELS) { + for (i = 13; i >= 0; i--) { + *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]); + } + } + if (pixel_num == EIGHT_PIXELS) { + for (i = 13; i >= 0; i--) { + _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]); + } + } + if (pixel_num == SIXTEEN_PIXELS) { + for (i = 13; i >= 0; i--) { + _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]); + } + } +} + +static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num, + unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -910,73 +951,62 @@ void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + __m128i x[14]; + x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi); + + store_buffer_horz_16(pixel_num, x, p, 6, s); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1186,15 +1216,35 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); +#if CONFIG_PARALLEL_DEBLOCKING + *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2); + *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1); + *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0); + *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0); + *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1); + *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2); +#else _mm_storel_epi64((__m128i *)(s - 3 * p), p2); _mm_storel_epi64((__m128i *)(s - 2 * p), p1); _mm_storel_epi64((__m128i *)(s - 1 * p), p0); _mm_storel_epi64((__m128i *)(s + 0 * p), q0); _mm_storel_epi64((__m128i *)(s + 1 * p), q1); _mm_storel_epi64((__m128i *)(s + 2 * p), q2); +#endif } } +void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { +#if CONFIG_PARALLEL_DEBLOCKING + lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh); +#else + lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh); +#endif +} + void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h new file mode 100644 index 000000000..027c890dc --- /dev/null +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H +#define _AOM_DSP_X86_LPF_COMMON_X86_H + +#include // SSE2 + +#include "./aom_config.h" + +static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], + int out_p, int num_8x8_to_transpose) { + int idx8x8 = 0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; + do { + uint16_t *in = src[idx8x8]; + uint16_t *out = dst[idx8x8]; + + p0 = + _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 + p1 = + _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 + p2 = + _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 + p3 = + _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 + p4 = + _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 + p5 = + _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 + p6 = + _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 + p7 = + _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 + x0 = _mm_unpacklo_epi16(p0, p1); + // 20 30 21 31 22 32 23 33 + x1 = _mm_unpacklo_epi16(p2, p3); + // 40 50 41 51 42 52 43 53 + x2 = _mm_unpacklo_epi16(p4, p5); + // 60 70 61 71 62 72 63 73 + x3 = _mm_unpacklo_epi16(p6, p7); + // 00 10 20 30 01 11 21 31 + x4 = _mm_unpacklo_epi32(x0, x1); + // 40 50 60 70 41 51 61 71 + x5 = _mm_unpacklo_epi32(x2, x3); + // 00 10 20 30 40 50 60 70 + x6 = _mm_unpacklo_epi64(x4, x5); + // 01 11 21 31 41 51 61 71 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); + // 00 10 20 30 40 50 60 70 + _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); + // 01 11 21 31 41 51 61 71 + + // 02 12 22 32 03 13 23 33 + x4 = _mm_unpackhi_epi32(x0, x1); + // 42 52 62 72 43 53 63 73 + x5 = _mm_unpackhi_epi32(x2, x3); + // 02 12 22 32 42 52 62 72 + x6 = _mm_unpacklo_epi64(x4, x5); + // 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); + // 02 12 22 32 42 52 62 72 + _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); + // 03 13 23 33 43 53 63 73 + + // 04 14 05 15 06 16 07 17 + x0 = _mm_unpackhi_epi16(p0, p1); + // 24 34 25 35 26 36 27 37 + x1 = _mm_unpackhi_epi16(p2, p3); + // 44 54 45 55 46 56 47 57 + x2 = _mm_unpackhi_epi16(p4, p5); + // 64 74 65 75 66 76 67 77 + x3 = _mm_unpackhi_epi16(p6, p7); + // 04 14 24 34 05 15 25 35 + x4 = _mm_unpacklo_epi32(x0, x1); + // 44 54 64 74 45 55 65 75 + x5 = _mm_unpacklo_epi32(x2, x3); + // 04 14 24 34 44 54 64 74 + x6 = _mm_unpacklo_epi64(x4, x5); + // 05 15 25 35 45 55 65 75 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); + // 04 14 24 34 44 54 64 74 + _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); + // 05 15 25 35 45 55 65 75 + + // 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi32(x0, x1); + // 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi32(x2, x3); + // 06 16 26 36 46 56 66 76 + x6 = _mm_unpacklo_epi64(x4, x5); + // 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi64(x4, x5); + + _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); + // 06 16 26 36 46 56 66 76 + _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); + // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, + uint16_t *out, int out_p) { + uint16_t *src0[1]; + uint16_t *src1[1]; + uint16_t *dest0[1]; + uint16_t *dest1[1]; + src0[0] = in0; + src1[0] = in1; + dest0[0] = out; + dest1[0] = out + 8; + highbd_transpose(src0, in_p, dest0, out_p, 1); + highbd_transpose(src1, in_p, dest1, out_p, 1); +} +#endif // _AOM_DSP_X86_LPF_COMMON_X86_H diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 6a73ac460..2536f91d2 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -98,7 +98,13 @@ MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) -#endif +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +MASKSADMXN_SSSE3(32, 128) +MASKSADMXN_SSSE3(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -294,7 +300,13 @@ HIGHBD_MASKSAD4XN_SSSE3(16) HIGHBD_MASKSADMXN_SSSE3(16, 4) HIGHBD_MASKSADMXN_SSSE3(8, 32) HIGHBD_MASKSADMXN_SSSE3(32, 8) -#endif +HIGHBD_MASKSADMXN_SSSE3(16, 64) +HIGHBD_MASKSADMXN_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASKSADMXN_SSSE3(32, 128) +HIGHBD_MASKSADMXN_SSSE3(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c index 24e7ed1c6..3ffe132be 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -131,7 +131,13 @@ MASK_SUBPIX_VAR4XH_SSSE3(16) MASK_SUBPIX_VAR_SSSE3(16, 4) MASK_SUBPIX_VAR8XH_SSSE3(32) MASK_SUBPIX_VAR_SSSE3(32, 8) -#endif +MASK_SUBPIX_VAR_SSSE3(64, 16) +MASK_SUBPIX_VAR_SSSE3(16, 64) +#if CONFIG_EXT_PARTITION +MASK_SUBPIX_VAR_SSSE3(128, 32) +MASK_SUBPIX_VAR_SSSE3(32, 128) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES static INLINE __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -712,6 +718,12 @@ HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) +#if CONFIG_EXT_PARTITION +HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128) +HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32) +#endif #endif static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c index 3fd6f71e5..52dd508ec 100644 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -142,6 +142,8 @@ OBMCSADWXH(4, 16) OBMCSADWXH(16, 4) OBMCSADWXH(8, 32) OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) #endif //////////////////////////////////////////////////////////////////////////////// @@ -271,5 +273,7 @@ HBD_OBMCSADWXH(4, 16) HBD_OBMCSADWXH(16, 4) HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) #endif #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 44cfa8e28..392616af3 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -151,7 +151,13 @@ OBMCVARWXH(4, 16) OBMCVARWXH(16, 4) OBMCVARWXH(8, 32) OBMCVARWXH(32, 8) -#endif +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) +#if CONFIG_EXT_PARTITION +OBMCVARWXH(32, 128) +OBMCVARWXH(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES //////////////////////////////////////////////////////////////////////////////// // High bit-depth @@ -364,5 +370,11 @@ HBD_OBMCVARWXH(4, 16) HBD_OBMCVARWXH(16, 4) HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) -#endif +HBD_OBMCVARWXH(16, 64) +HBD_OBMCVARWXH(64, 16) +#if CONFIG_EXT_PARTITION +HBD_OBMCVARWXH(32, 128) +HBD_OBMCVARWXH(128, 32) +#endif // CONFIG_EXT_PARTITION +#endif // CONFIG_EXT_PARTITION_TYPES #endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 890c1f01e..0e7f679d0 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -16,29 +16,29 @@ #include "aom/aom_integer.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif + if (sizeof(tran_low_t) == 4) { + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); + } else { + return _mm_load_si128((const __m128i *)coeff_ptr); + } } static INLINE void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif + if (sizeof(tran_low_t) == 4) { + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); + } else { + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); + } } void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm index 4570e2ce6..2c67f450f 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -256,4 +256,6 @@ SADNXN4D 4, 16 SADNXN4D 16, 4 SADNXN4D 8, 32 SADNXN4D 32, 8 +SADNXN4D 16, 64 +SADNXN4D 64, 16 %endif diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm index 88d427077..b4cc6abf1 100644 --- a/third_party/aom/aom_dsp/x86/sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -163,6 +163,10 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +%if CONFIG_EXT_PARTITION_TYPES +SAD64XN 16 ; sad64x16_sse2 +SAD64XN 16, 1 ; sad64x16_avg_sse2 +%endif ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -261,6 +265,8 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 %if CONFIG_EXT_PARTITION_TYPES SAD16XN 4 ; sad_16x4_sse2 SAD16XN 4, 1 ; sad_16x4_avg_sse2 +SAD16XN 64 ; sad_16x64_sse2 +SAD16XN 64, 1 ; sad_16x64_avg_sse2 %endif ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h index 4f7a60c22..1a8fed710 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -15,6 +15,7 @@ #include #include "aom_dsp/txfm_common.h" +#include "aom_dsp/x86/common_avx2.h" #define pair256_set_epi16(a, b) \ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ @@ -34,135 +35,6 @@ static INLINE void mm256_reverse_epi16(__m256i *u) { *u = _mm256_permute2x128_si256(v, v, 1); } -// Note: in and out could have the same value -static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { - __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]); - __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]); - __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]); - __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]); - __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]); - - __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]); - __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]); - __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]); - __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]); - __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]); - __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]); - __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]); - __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]); - - // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b - // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f - // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b - // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f - // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b - // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f - // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b - // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f - - // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b - // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f - // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb - // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf - // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db - // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df - // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb - // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff - - __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2); - __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2); - __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3); - __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3); - __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6); - __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6); - __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7); - __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7); - - __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a); - __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a); - __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b); - __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b); - __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e); - __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e); - __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f); - __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f); - - // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39 - // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b - // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d - // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f - // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79 - // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b - // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d - // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f - - // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9 - // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb - // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd - // 86 96 a6 b6 87 97 a7 b7 8e ae 9e be 8f 9f af bf - // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9 - // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb - // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ef fc cd dd ed fd - // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff - - tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - - tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c); - tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c); - tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d); - tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d); - tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e); - tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e); - tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f); - tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f); - - // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78 - // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79 - // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a - // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b - // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c - // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d - // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e - // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f - - // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8 - // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9 - // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa - // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb - // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ef fc - // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd - // 86 96 a6 b6 c6 d6 e6 f6 8e ae 9e be ce de ee fe - // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff - - out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000 - out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001 - out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20); - out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31); - out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20); - out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31); - out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20); - out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31); - - out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20); - out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31); - out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20); - out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31); - out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20); - out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31); - out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); - out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); -} - static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1, const __m256i *cospi) { const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h index e4ac56339..4e6eecd32 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h @@ -16,16 +16,16 @@ // This header file should be put below any x86 intrinsics head file static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_storeu_si128((__m128i *)(dst_ptr), out0); - _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); -#else - _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); -#endif // CONFIG_HIGHBITDEPTH + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); + } } #endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 918844185..211fad3f8 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -382,6 +382,28 @@ unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride, assert(sum >= -255 * 32 * 8); return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); } + +unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 16 * 64); + assert(sum >= -255 * 16 * 64); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} + +unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + int sum; + variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum, + aom_get16x16var_sse2, 16); + assert(sum <= 255 * 64 * 16); + assert(sum >= -255 * 64 * 16); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +} #endif // The 2 unused parameters are place holders for PIC enabled build. @@ -451,7 +473,9 @@ DECLS(ssse3); FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ - FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)) + FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \ + FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \ + FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t)) #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ @@ -543,7 +567,9 @@ DECLS(ssse3); FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ - FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)) + FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \ + FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \ + FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t)) #else #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ -- cgit v1.2.3 From 464c3130fcc3d123a5b2ef4639900e70f73e4417 Mon Sep 17 00:00:00 2001 From: trav90 Date: Thu, 18 Oct 2018 21:55:31 -0500 Subject: [av1] Fix build issues This revision of libaom has some conflicts with our vendor script and build system. A number of new .asm files have the same basename as .c files, which our build system cannot handle. To work around this, I manually renamed the conflicting files in the filesystem and sources.mozbuild. Also add av1_convolve_scale_sse4.c to sources.mozbuild manually. This is needed by the build but for some reason isn't picked up by generate_sources_mozbuild.sh. --- .../aom/aom_dsp/arm/idct16x16_1_add_neon.asm | 201 --- .../aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm | 201 +++ third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm | 1182 ------------------ .../aom/aom_dsp/arm/idct16x16_add_neon_asm.asm | 1182 ++++++++++++++++++ .../aom/aom_dsp/arm/idct32x32_1_add_neon.asm | 147 --- .../aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm | 147 +++ third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm | 1302 -------------------- .../aom/aom_dsp/arm/idct32x32_add_neon_asm.asm | 1302 ++++++++++++++++++++ third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm | 71 -- .../aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm | 71 ++ third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm | 193 --- .../aom/aom_dsp/arm/idct4x4_add_neon_asm.asm | 193 +++ third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm | 91 -- .../aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm | 91 ++ third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm | 522 -------- .../aom/aom_dsp/arm/idct8x8_add_neon_asm.asm | 522 ++++++++ third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm | 202 --- .../aom/aom_dsp/arm/loopfilter_16_neon_asm.asm | 202 +++ third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm | 252 ---- .../aom/aom_dsp/arm/loopfilter_4_neon_asm.asm | 252 ++++ third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm | 428 ------- .../aom/aom_dsp/arm/loopfilter_8_neon_asm.asm | 428 +++++++ .../aom/aom_dsp/x86/highbd_intrapred_sse2.asm | 259 ---- .../aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm | 259 ++++ third_party/aom/aom_dsp/x86/intrapred_sse2.asm | 625 ---------- third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm | 625 ++++++++++ third_party/aom/aom_dsp/x86/intrapred_ssse3.asm | 410 ------ .../aom/aom_dsp/x86/intrapred_ssse3_asm.asm | 410 ++++++ 28 files changed, 5885 insertions(+), 5885 deletions(-) delete mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm create mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm deleted file mode 100644 index d01c4bc03..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.asm +++ /dev/null @@ -1,201 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |aom_idct16x16_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm new file mode 100644 index 000000000..d01c4bc03 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm @@ -0,0 +1,201 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + + EXPORT |aom_idct16x16_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct16x16_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asr r0, r0, #6 ; >> 6 + + vdup.s16 q0, r0 ; duplicate a1 + mov r0, #8 + sub r2, #8 + + ; load destination data row0 - row3 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row4 - row7 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row8 - row11 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + ; load destination data row12 - row15 + vld1.64 {d2}, [r1], r0 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r0 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r0 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r0 + vld1.64 {d17}, [r1], r2 + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r0 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r0 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |aom_idct16x16_1_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm deleted file mode 100644 index 4a8f8f183..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.asm +++ /dev/null @@ -1,1182 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct16x16_256_add_neon_pass1| - EXPORT |aom_idct16x16_256_add_neon_pass2| - EXPORT |aom_idct16x16_10_add_neon_pass1| - EXPORT |aom_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64 = 3196 - mov r3, #0xc00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r12, #0x3e00 - add r12, #0xc5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; generate cospi_12_64 = 13623 - mov r3, #0x3500 - add r3, #0x37 - - ; generate cospi_20_64 = 9102 - mov r12, #0x2300 - add r12, #0x8e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; generate cospi_24_64 = 6270 - mov r12, #0x1800 - add r12, #0x7e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; generate cospi_8_64 = 15137 - mov r3, #0x3b00 - add r3, #0x21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, #14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |aom_idct16x16_256_add_neon_pass1| - -;void aom_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate cospi_30_64 = 1606 - mov r3, #0x0600 - add r3, #0x46 - - ; generate cospi_2_64 = 16305 - mov r12, #0x3f00 - add r12, #0xb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; generate cospi_14_64 = 12665 - mov r3, #0x3100 - add r3, #0x79 - - ; generate cospi_18_64 = 10394 - mov r12, #0x2800 - add r12, #0x9a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b - - ; generate cospi_10_64 = 14449 - mov r12, #0x3800 - add r12, #0x71 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; generate cospi_6_64 = 15679 - mov r3, #0x3d00 - add r3, #0x3f - - ; generate cospi_26_64 = 4756 - mov r12, #0x1200 - add r12, #0x94 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |aom_idct16x16_256_add_neon_pass2| - -;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64*2 = 6392 - mov r3, #0x1800 - add r3, #0xf8 - - ; generate cospi_4_64*2 = 32138 - mov r12, #0x7d00 - add r12, #0x8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. - ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; generate cospi_16_64*2 = 23170 - mov r3, #0x5a00 - add r3, #0x82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |aom_idct16x16_10_add_neon_pass1| - -;void aom_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate 2*cospi_30_64 = 3212 - mov r3, #0xc00 - add r3, #0x8c - - ; generate 2*cospi_2_64 = 32610 - mov r12, #0x7f00 - add r12, #0x62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; generate 2*cospi_26_64 = 9512 - mov r12, #0x2500 - add r12, #0x28 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; generate 2*cospi_6_64 = 31358 - mov r3, #0x7a00 - add r3, #0x7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |aom_idct16x16_10_add_neon_pass2| - END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm new file mode 100644 index 000000000..4a8f8f183 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm @@ -0,0 +1,1182 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_idct16x16_256_add_neon_pass1| + EXPORT |aom_idct16x16_256_add_neon_pass2| + EXPORT |aom_idct16x16_10_add_neon_pass1| + EXPORT |aom_idct16x16_10_add_neon_pass2| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_256_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64 = 3196 + mov r3, #0xc00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r12, #0x3e00 + add r12, #0xc5 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r12 ; duplicate cospi_4_64 + + ; preloading to avoid stall + ; generate cospi_12_64 = 13623 + mov r3, #0x3500 + add r3, #0x37 + + ; generate cospi_20_64 = 9102 + mov r12, #0x2300 + add r12, #0x8e + + ; step2[4] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; step2[4] * cospi_4_64 + vmull.s16 q5, d18, d1 + vmull.s16 q6, d19, d1 + + ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 + vmlal.s16 q5, d30, d0 + vmlal.s16 q6, d31, d0 + + vdup.16 d2, r3 ; duplicate cospi_12_64 + vdup.16 d3, r12 ; duplicate cospi_20_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q5, #14 ; >> 14 + vqrshrn.s32 d15, q6, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; generate cospi_24_64 = 6270 + mov r12, #0x1800 + add r12, #0x7e + + ; step2[5] * cospi_12_64 + vmull.s16 q2, d26, d2 + vmull.s16 q3, d27, d2 + + ; step2[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q15, d27, d3 + + ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q2, d22, d3 + vmlsl.s16 q3, d23, d3 + + ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q15, d23, d2 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q2, #14 ; >> 14 + vqrshrn.s32 d11, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q15, #14 ; >> 14 + + ; stage 4 + vdup.16 d30, r3 ; cospi_16_64 + + ; step1[0] * cospi_16_64 + vmull.s16 q2, d16, d30 + vmull.s16 q11, d17, d30 + + ; step1[1] * cospi_16_64 + vmull.s16 q0, d24, d30 + vmull.s16 q1, d25, d30 + + ; generate cospi_8_64 = 15137 + mov r3, #0x3b00 + add r3, #0x21 + + vdup.16 d30, r12 ; duplicate cospi_24_64 + vdup.16 d31, r3 ; duplicate cospi_8_64 + + ; temp1 = (step1[0] + step1[1]) * cospi_16_64 + vadd.s32 q3, q2, q0 + vadd.s32 q12, q11, q1 + + ; temp2 = (step1[0] - step1[1]) * cospi_16_64 + vsub.s32 q13, q2, q0 + vsub.s32 q1, q11, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d16, q3, #14 ; >> 14 + vqrshrn.s32 d17, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d18, q13, #14 ; >> 14 + vqrshrn.s32 d19, q1, #14 ; >> 14 + + ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + ; step1[2] * cospi_8_64 + vmull.s16 q0, d20, d31 + vmull.s16 q1, d21, d31 + + ; step1[2] * cospi_24_64 + vmull.s16 q12, d20, d30 + vmull.s16 q13, d21, d30 + + ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q0, d28, d30 + vmlal.s16 q1, d29, d30 + + ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q12, d28, d31 + vmlsl.s16 q13, d29, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d22, q0, #14 ; >> 14 + vqrshrn.s32 d23, q1, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d20, q12, #14 ; >> 14 + vqrshrn.s32 d21, q13, #14 ; >> 14 + + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; + vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + ; stage 5 + vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; + vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; + vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; + vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; + + vdup.16 d16, r3; ; duplicate cospi_16_64 + + ; step2[5] * cospi_16_64 + vmull.s16 q11, d26, d16 + vmull.s16 q12, d27, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q6, q9, q11 + vsub.s32 q13, q10, q12 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q6, #14 ; >> 14 + vqrshrn.s32 d11, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d16}, [r1], r2 + vst1.64 {d17}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |aom_idct16x16_256_add_neon_pass1| + +;void aom_idct16x16_256_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_256_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! + vmov.s16 q15, q0; + + ; generate cospi_30_64 = 1606 + mov r3, #0x0600 + add r3, #0x46 + + ; generate cospi_2_64 = 16305 + mov r12, #0x3f00 + add r12, #0xb1 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 d12, r3 ; duplicate cospi_30_64 + vdup.16 d13, r12 ; duplicate cospi_2_64 + + ; preloading to avoid stall + ; generate cospi_14_64 = 12665 + mov r3, #0x3100 + add r3, #0x79 + + ; generate cospi_18_64 = 10394 + mov r12, #0x2800 + add r12, #0x9a + + ; step1[8] * cospi_30_64 + vmull.s16 q2, d16, d12 + vmull.s16 q3, d17, d12 + + ; step1[8] * cospi_2_64 + vmull.s16 q1, d16, d13 + vmull.s16 q4, d17, d13 + + ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 + vmlsl.s16 q2, d30, d13 + vmlsl.s16 q3, d31, d13 + + ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 + vmlal.s16 q1, d30, d12 + vmlal.s16 q4, d31, d12 + + vdup.16 d30, r3 ; duplicate cospi_14_64 + vdup.16 d31, r12 ; duplicate cospi_18_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d0, q2, #14 ; >> 14 + vqrshrn.s32 d1, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d14, q1, #14 ; >> 14 + vqrshrn.s32 d15, q4, #14 ; >> 14 + + ; preloading to avoid stall + ; generate cospi_22_64 = 7723 + mov r3, #0x1e00 + add r3, #0x2b + + ; generate cospi_10_64 = 14449 + mov r12, #0x3800 + add r12, #0x71 + + ; step1[9] * cospi_14_64 + vmull.s16 q2, d24, d30 + vmull.s16 q3, d25, d30 + + ; step1[9] * cospi_18_64 + vmull.s16 q4, d24, d31 + vmull.s16 q5, d25, d31 + + ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 + vmlsl.s16 q2, d22, d31 + vmlsl.s16 q3, d23, d31 + + ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 + vmlal.s16 q4, d22, d30 + vmlal.s16 q5, d23, d30 + + vdup.16 d30, r3 ; duplicate cospi_22_64 + vdup.16 d31, r12 ; duplicate cospi_10_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q2, #14 ; >> 14 + vqrshrn.s32 d3, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q4, #14 ; >> 14 + vqrshrn.s32 d13, q5, #14 ; >> 14 + + ; step1[10] * cospi_22_64 + vmull.s16 q11, d20, d30 + vmull.s16 q12, d21, d30 + + ; step1[10] * cospi_10_64 + vmull.s16 q4, d20, d31 + vmull.s16 q5, d21, d31 + + ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 + vmlsl.s16 q11, d26, d31 + vmlsl.s16 q12, d27, d31 + + ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 + vmlal.s16 q4, d26, d30 + vmlal.s16 q5, d27, d30 + + ; preloading to avoid stall + ; generate cospi_6_64 = 15679 + mov r3, #0x3d00 + add r3, #0x3f + + ; generate cospi_26_64 = 4756 + mov r12, #0x1200 + add r12, #0x94 + + vdup.16 d30, r3 ; duplicate cospi_6_64 + vdup.16 d31, r12 ; duplicate cospi_26_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d11, q5, #14 ; >> 14 + vqrshrn.s32 d10, q4, #14 ; >> 14 + + ; step1[11] * cospi_6_64 + vmull.s16 q10, d28, d30 + vmull.s16 q11, d29, d30 + + ; step1[11] * cospi_26_64 + vmull.s16 q12, d28, d31 + vmull.s16 q13, d29, d31 + + ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 + vmlsl.s16 q10, d18, d31 + vmlsl.s16 q11, d19, d31 + + ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 + vmlal.s16 q12, d18, d30 + vmlal.s16 q13, d19, d30 + + vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] + vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q11, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q12, #14 ; >> 14 + vqrshrn.s32 d9, q13, #14 ; >> 14 + + ; stage 3 + vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] + vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] + vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] + vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] + vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] + vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + + ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vdup.16 d30, r12 ; duplicate cospi_8_64 + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d18, d31 + vmull.s16 q3, d19, d31 + + ; step1[14] * cospi_24_64 + vmull.s16 q4, d28, d31 + vmull.s16 q5, d29, d31 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d28, d30 + vmlal.s16 q3, d29, d30 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q4, d18, d30 + vmlsl.s16 q5, d19, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q3, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q4, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + vmov.s16 q3, q11 + vmov.s16 q4, q12 + + ; - step1[13] * cospi_8_64 + vmull.s16 q11, d26, d30 + vmull.s16 q12, d27, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d20, d30 + vmull.s16 q9, d21, d30 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlsl.s16 q11, d20, d31 + vmlsl.s16 q12, d21, d31 + + ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d26, d31 + vmlal.s16 q9, d27, d31 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d4, q11, #14 ; >> 14 + vqrshrn.s32 d5, q12, #14 ; >> 14 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q10, q3, q0 + vadd.s32 q4, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q10, #14 ; >> 14 + vqrshrn.s32 d11, q4, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + cmp r3, #0 ; check if need adding dest data + beq skip_adding_dest + + ldr r7, [sp, #28] ; dest used to save element 0-7 + mov r9, r7 ; save dest pointer for later use + ldr r8, [sp, #32] ; load dest_stride + + ; stage 7 + ; load the data in pass1 + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO + vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q12 ; clip pixel + vqmovun.s16 d13, q13 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + + ; store the data output 8,9,10,11,12,13,14,15 + vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO + vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q8 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q9, q9, #6 + vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q9 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q2, q2, #6 + vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q2 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q3, q3, #6 + vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q3 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q4, q4, #6 + vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q4 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q5, q5, #6 + vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q5 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + vld1.64 {d13}, [r7], r8 ; load destinatoin data + vrshr.s16 q14, q14, #6 + vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vqmovun.s16 d12, q14 ; clip pixel + vst1.64 {d12}, [r9], r8 ; store the data + vld1.64 {d12}, [r7], r8 ; load destinatoin data + vrshr.s16 q15, q15, #6 + vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vqmovun.s16 d13, q15 ; clip pixel + vst1.64 {d13}, [r9], r8 ; store the data + b end_idct16x16_pass2 + +skip_adding_dest + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |aom_idct16x16_256_add_neon_pass2| + +;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input, +; int16_t *output, int output_stride) +; +; r0 int16_t input +; r1 int16_t *output +; r2 int output_stride) + +; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_10_add_neon_pass1| PROC + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q1,q2}, [r0]! + vmov.s16 q15, q1 + + ; generate cospi_28_64*2 = 6392 + mov r3, #0x1800 + add r3, #0xf8 + + ; generate cospi_4_64*2 = 32138 + mov r12, #0x7d00 + add r12, #0x8a + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q0, r3 ; duplicate cospi_28_64*2 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, + ; double, and return the high 16 bits, effectively giving >> 15. Doubling + ; the constant will change this to >> 14. + ; dct_const_round_shift(step2[4] * cospi_28_64); + vqrdmulh.s16 q4, q9, q0 + + ; preloading to avoid stall + ; generate cospi_16_64*2 = 23170 + mov r3, #0x5a00 + add r3, #0x82 + + ; dct_const_round_shift(step2[4] * cospi_4_64); + vqrdmulh.s16 q7, q9, q1 + + ; stage 4 + vdup.16 q1, r3 ; cospi_16_64*2 + + ; generate cospi_16_64 = 11585 + mov r3, #0x2d00 + add r3, #0x41 + + vdup.16 d4, r3; ; duplicate cospi_16_64 + + ; dct_const_round_shift(step1[0] * cospi_16_64) + vqrdmulh.s16 q8, q8, q1 + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d14, d4 + vmull.s16 q10, d15, d4 + + ; step2[5] * cospi_16_64 + vmull.s16 q12, d9, d4 + vmull.s16 q11, d8, d4 + + ; temp1 = (step2[6] - step2[5]) * cospi_16_64 + vsub.s32 q15, q10, q12 + vsub.s32 q6, q9, q11 + + ; temp2 = (step2[5] + step2[6]) * cospi_16_64 + vadd.s32 q9, q9, q11 + vadd.s32 q10, q10, q12 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d11, q15, #14 ; >> 14 + vqrshrn.s32 d10, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q10, #14 ; >> 14 + + ; stage 6 + vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; + vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; + vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; + vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; + vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; + vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; + vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; + + ; store the data + vst1.64 {d4}, [r1], r2 + vst1.64 {d5}, [r1], r2 + vst1.64 {d18}, [r1], r2 + vst1.64 {d19}, [r1], r2 + vst1.64 {d20}, [r1], r2 + vst1.64 {d21}, [r1], r2 + vst1.64 {d22}, [r1], r2 + vst1.64 {d23}, [r1], r2 + vst1.64 {d24}, [r1], r2 + vst1.64 {d25}, [r1], r2 + vst1.64 {d26}, [r1], r2 + vst1.64 {d27}, [r1], r2 + vst1.64 {d28}, [r1], r2 + vst1.64 {d29}, [r1], r2 + vst1.64 {d30}, [r1], r2 + vst1.64 {d31}, [r1], r2 + + bx lr + ENDP ; |aom_idct16x16_10_add_neon_pass1| + +;void aom_idct16x16_10_add_neon_pass2(int16_t *src, +; int16_t *output, +; int16_t *pass1Output, +; int16_t skip_adding, +; uint8_t *dest, +; int dest_stride) +; +; r0 int16_t *src +; r1 int16_t *output, +; r2 int16_t *pass1Output, +; r3 int16_t skip_adding, +; r4 uint8_t *dest, +; r5 int dest_stride) + +; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output +; will be stored back into q8-q15 registers. This function will touch q0-q7 +; registers and use them as buffer during calculation. +|aom_idct16x16_10_add_neon_pass2| PROC + push {r3-r9} + + ; TODO(hkuang): Find a better way to load the elements. + ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 + vld2.s16 {q8,q9}, [r0]! + vld2.s16 {q9,q10}, [r0]! + vld2.s16 {q10,q11}, [r0]! + vld2.s16 {q11,q12}, [r0]! + vld2.s16 {q12,q13}, [r0]! + vld2.s16 {q13,q14}, [r0]! + vld2.s16 {q14,q15}, [r0]! + vld2.s16 {q0,q1}, [r0]! + vmov.s16 q15, q0; + + ; generate 2*cospi_30_64 = 3212 + mov r3, #0xc00 + add r3, #0x8c + + ; generate 2*cospi_2_64 = 32610 + mov r12, #0x7f00 + add r12, #0x62 + + ; transpose the input data + TRANSPOSE8X8 + + ; stage 3 + vdup.16 q6, r3 ; duplicate 2*cospi_30_64 + + ; dct_const_round_shift(step1[8] * cospi_30_64) + vqrdmulh.s16 q0, q8, q6 + + vdup.16 q6, r12 ; duplicate 2*cospi_2_64 + + ; dct_const_round_shift(step1[8] * cospi_2_64) + vqrdmulh.s16 q7, q8, q6 + + ; preloading to avoid stall + ; generate 2*cospi_26_64 = 9512 + mov r12, #0x2500 + add r12, #0x28 + rsb r12, #0 + vdup.16 q15, r12 ; duplicate -2*cospi_26_64 + + ; generate 2*cospi_6_64 = 31358 + mov r3, #0x7a00 + add r3, #0x7e + vdup.16 q14, r3 ; duplicate 2*cospi_6_64 + + ; dct_const_round_shift(- step1[12] * cospi_26_64) + vqrdmulh.s16 q3, q9, q15 + + ; dct_const_round_shift(step1[12] * cospi_6_64) + vqrdmulh.s16 q4, q9, q14 + + ; stage 4 + ; generate cospi_24_64 = 6270 + mov r3, #0x1800 + add r3, #0x7e + vdup.16 d31, r3 ; duplicate cospi_24_64 + + ; generate cospi_8_64 = 15137 + mov r12, #0x3b00 + add r12, #0x21 + vdup.16 d30, r12 ; duplicate cospi_8_64 + + ; step1[14] * cospi_24_64 + vmull.s16 q12, d14, d31 + vmull.s16 q5, d15, d31 + + ; step1[9] * cospi_24_64 + vmull.s16 q2, d0, d31 + vmull.s16 q11, d1, d31 + + ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 + vmlsl.s16 q12, d0, d30 + vmlsl.s16 q5, d1, d30 + + ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 + vmlal.s16 q2, d14, d30 + vmlal.s16 q11, d15, d30 + + rsb r12, #0 + vdup.16 d30, r12 ; duplicate -cospi_8_64 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d2, q12, #14 ; >> 14 + vqrshrn.s32 d3, q5, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d12, q2, #14 ; >> 14 + vqrshrn.s32 d13, q11, #14 ; >> 14 + + ; - step1[13] * cospi_8_64 + vmull.s16 q10, d8, d30 + vmull.s16 q13, d9, d30 + + ; -step1[10] * cospi_8_64 + vmull.s16 q8, d6, d30 + vmull.s16 q9, d7, d30 + + ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 + vmlsl.s16 q10, d6, d31 + vmlsl.s16 q13, d7, d31 + + ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 + vmlal.s16 q8, d8, d31 + vmlal.s16 q9, d9, d31 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q10, #14 ; >> 14 + vqrshrn.s32 d5, q13, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q8, #14 ; >> 14 + vqrshrn.s32 d11, q9, #14 ; >> 14 + + ; stage 5 + vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; + vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; + vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; + vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; + vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; + vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; + vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; + vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; + + ; stage 6. + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + vdup.16 d14, r12 ; duplicate cospi_16_64 + + ; step1[13] * cospi_16_64 + vmull.s16 q3, d26, d14 + vmull.s16 q4, d27, d14 + + ; step1[10] * cospi_16_64 + vmull.s16 q0, d20, d14 + vmull.s16 q1, d21, d14 + + ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 + vsub.s32 q5, q3, q0 + vsub.s32 q6, q4, q1 + + ; temp2 = (step1[10] + step1[13]) * cospi_16_64 + vadd.s32 q0, q3, q0 + vadd.s32 q1, q4, q1 + + ; dct_const_round_shift(temp1) + vqrshrn.s32 d4, q5, #14 ; >> 14 + vqrshrn.s32 d5, q6, #14 ; >> 14 + + ; dct_const_round_shift(temp2) + vqrshrn.s32 d10, q0, #14 ; >> 14 + vqrshrn.s32 d11, q1, #14 ; >> 14 + + ; step1[11] * cospi_16_64 + vmull.s16 q0, d22, d14 + vmull.s16 q1, d23, d14 + + ; step1[12] * cospi_16_64 + vmull.s16 q13, d24, d14 + vmull.s16 q6, d25, d14 + + ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 + vsub.s32 q10, q13, q0 + vsub.s32 q4, q6, q1 + + ; temp2 = (step1[11] + step1[12]) * cospi_16_64 + vadd.s32 q13, q13, q0 + vadd.s32 q6, q6, q1 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d6, q10, #14 ; >> 14 + vqrshrn.s32 d7, q4, #14 ; >> 14 + + ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); + vqrshrn.s32 d8, q13, #14 ; >> 14 + vqrshrn.s32 d9, q6, #14 ; >> 14 + + mov r4, #16 ; pass1Output stride + ldr r3, [sp] ; load skip_adding + + ; stage 7 + ; load the data in pass1 + mov r5, #24 + mov r3, #8 + + vld1.s16 {q0}, [r2], r4 ; load data step2[0] + vld1.s16 {q1}, [r2], r4 ; load data step2[1] + vadd.s16 q12, q0, q15 ; step2[0] + step2[15] + vadd.s16 q13, q1, q14 ; step2[1] + step2[14] + vld1.s16 {q10}, [r2], r4 ; load data step2[2] + vld1.s16 {q11}, [r2], r4 ; load data step2[3] + vst1.64 {d24}, [r1], r3 ; store output[0] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[1] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q5 ; step2[2] + step2[13] + vadd.s16 q13, q11, q4 ; step2[3] + step2[12] + vsub.s16 q14, q1, q14 ; step2[1] - step2[14] + vsub.s16 q15, q0, q15 ; step2[0] - step2[15] + vst1.64 {d24}, [r1], r3 ; store output[2] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[3] + vst1.64 {d27}, [r1], r5 + vsub.s16 q4, q11, q4 ; step2[3] - step2[12] + vsub.s16 q5, q10, q5 ; step2[2] - step2[13] + vld1.s16 {q0}, [r2], r4 ; load data step2[4] + vld1.s16 {q1}, [r2], r4 ; load data step2[5] + vadd.s16 q12, q0, q3 ; step2[4] + step2[11] + vadd.s16 q13, q1, q2 ; step2[5] + step2[10] + vld1.s16 {q10}, [r2], r4 ; load data step2[6] + vld1.s16 {q11}, [r2], r4 ; load data step2[7] + vst1.64 {d24}, [r1], r3 ; store output[4] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[5] + vst1.64 {d27}, [r1], r5 + vadd.s16 q12, q10, q9 ; step2[6] + step2[9] + vadd.s16 q13, q11, q8 ; step2[7] + step2[8] + vsub.s16 q2, q1, q2 ; step2[5] - step2[10] + vsub.s16 q3, q0, q3 ; step2[4] - step2[11] + vsub.s16 q8, q11, q8 ; step2[7] - step2[8] + vsub.s16 q9, q10, q9 ; step2[6] - step2[9] + vst1.64 {d24}, [r1], r3 ; store output[6] + vst1.64 {d25}, [r1], r5 + vst1.64 {d26}, [r1], r3 ; store output[7] + vst1.64 {d27}, [r1], r5 + + ; store the data output 8,9,10,11,12,13,14,15 + vst1.64 {d16}, [r1], r3 + vst1.64 {d17}, [r1], r5 + vst1.64 {d18}, [r1], r3 + vst1.64 {d19}, [r1], r5 + vst1.64 {d4}, [r1], r3 + vst1.64 {d5}, [r1], r5 + vst1.64 {d6}, [r1], r3 + vst1.64 {d7}, [r1], r5 + vst1.64 {d8}, [r1], r3 + vst1.64 {d9}, [r1], r5 + vst1.64 {d10}, [r1], r3 + vst1.64 {d11}, [r1], r5 + vst1.64 {d28}, [r1], r3 + vst1.64 {d29}, [r1], r5 + vst1.64 {d30}, [r1], r3 + vst1.64 {d31}, [r1], r5 +end_idct10_16x16_pass2 + pop {r3-r9} + bx lr + ENDP ; |aom_idct16x16_10_add_neon_pass2| + END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm deleted file mode 100644 index b04df2d0b..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.asm +++ /dev/null @@ -1,147 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - EXPORT |aom_idct32x32_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ;TODO(hkuang): put the following macros in a seperate - ;file so other idct function could also use them. - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10},[$dst], $stride - vst1.8 {q11},[$dst], $stride - vst1.8 {q12},[$dst], $stride - vst1.8 {q13},[$dst], $stride - vst1.8 {q14},[$dst], $stride - vst1.8 {q15},[$dst], $stride - MEND - -;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride - -|aom_idct32x32_1_add_neon| PROC - push {lr} - pld [r1] - add r3, r1, #16 ; r3 dest + 16 for second loop - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asrs r0, r0, #6 ; >> 6 - bge diff_positive_32_32 - -diff_negative_32_32 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_negative_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_negative_32_32_loop - pop {pc} - -diff_positive_32_32 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_positive_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_positive_32_32_loop - pop {pc} - - ENDP ; |aom_idct32x32_1_add_neon| - END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm new file mode 100644 index 000000000..b04df2d0b --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm @@ -0,0 +1,147 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + EXPORT |aom_idct32x32_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ;TODO(hkuang): put the following macros in a seperate + ;file so other idct function could also use them. + MACRO + LD_16x8 $src, $stride + vld1.8 {q8}, [$src], $stride + vld1.8 {q9}, [$src], $stride + vld1.8 {q10}, [$src], $stride + vld1.8 {q11}, [$src], $stride + vld1.8 {q12}, [$src], $stride + vld1.8 {q13}, [$src], $stride + vld1.8 {q14}, [$src], $stride + vld1.8 {q15}, [$src], $stride + MEND + + MACRO + ADD_DIFF_16x8 $diff + vqadd.u8 q8, q8, $diff + vqadd.u8 q9, q9, $diff + vqadd.u8 q10, q10, $diff + vqadd.u8 q11, q11, $diff + vqadd.u8 q12, q12, $diff + vqadd.u8 q13, q13, $diff + vqadd.u8 q14, q14, $diff + vqadd.u8 q15, q15, $diff + MEND + + MACRO + SUB_DIFF_16x8 $diff + vqsub.u8 q8, q8, $diff + vqsub.u8 q9, q9, $diff + vqsub.u8 q10, q10, $diff + vqsub.u8 q11, q11, $diff + vqsub.u8 q12, q12, $diff + vqsub.u8 q13, q13, $diff + vqsub.u8 q14, q14, $diff + vqsub.u8 q15, q15, $diff + MEND + + MACRO + ST_16x8 $dst, $stride + vst1.8 {q8}, [$dst], $stride + vst1.8 {q9}, [$dst], $stride + vst1.8 {q10},[$dst], $stride + vst1.8 {q11},[$dst], $stride + vst1.8 {q12},[$dst], $stride + vst1.8 {q13},[$dst], $stride + vst1.8 {q14},[$dst], $stride + vst1.8 {q15},[$dst], $stride + MEND + +;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride + +|aom_idct32x32_1_add_neon| PROC + push {lr} + pld [r1] + add r3, r1, #16 ; r3 dest + 16 for second loop + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 6) + add r0, r0, #32 ; + (1 <<((6) - 1)) + asrs r0, r0, #6 ; >> 6 + bge diff_positive_32_32 + +diff_negative_32_32 + neg r0, r0 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_negative_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + SUB_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_negative_32_32_loop + pop {pc} + +diff_positive_32_32 + usat r0, #8, r0 + vdup.u8 q0, r0 + mov r0, #4 + +diff_positive_32_32_loop + sub r0, #1 + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + + LD_16x8 r1, r2 + ADD_DIFF_16x8 q0 + ST_16x8 r12, r2 + cmp r0, #2 + moveq r1, r3 + moveq r12, r3 + cmp r0, #0 + bne diff_positive_32_32_loop + pop {pc} + + ENDP ; |aom_idct32x32_1_add_neon| + END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm deleted file mode 100644 index e7793fb16..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.asm +++ /dev/null @@ -1,1302 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -;TODO(cd): adjust these constant to be able to use vqdmulh for faster -; dct_const_round_shift(a * b) within butterfly calculations. -cospi_1_64 EQU 16364 -cospi_2_64 EQU 16305 -cospi_3_64 EQU 16207 -cospi_4_64 EQU 16069 -cospi_5_64 EQU 15893 -cospi_6_64 EQU 15679 -cospi_7_64 EQU 15426 -cospi_8_64 EQU 15137 -cospi_9_64 EQU 14811 -cospi_10_64 EQU 14449 -cospi_11_64 EQU 14053 -cospi_12_64 EQU 13623 -cospi_13_64 EQU 13160 -cospi_14_64 EQU 12665 -cospi_15_64 EQU 12140 -cospi_16_64 EQU 11585 -cospi_17_64 EQU 11003 -cospi_18_64 EQU 10394 -cospi_19_64 EQU 9760 -cospi_20_64 EQU 9102 -cospi_21_64 EQU 8423 -cospi_22_64 EQU 7723 -cospi_23_64 EQU 7005 -cospi_24_64 EQU 6270 -cospi_25_64 EQU 5520 -cospi_26_64 EQU 4756 -cospi_27_64 EQU 3981 -cospi_28_64 EQU 3196 -cospi_29_64 EQU 2404 -cospi_30_64 EQU 1606 -cospi_31_64 EQU 804 - - - EXPORT |aom_idct32x32_1024_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY - - ; -------------------------------------------------------------------------- - ; Load from transposed_buffer - ; q13 = transposed_buffer[first_offset] - ; q14 = transposed_buffer[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; transposed_buffer must be passed in. use 0 for first use. - MACRO - LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset - ; address calculation with proper stride and loading - add r0, #($first_offset - $prev_offset )*8*2 - vld1.s16 {q14}, [r0] - add r0, #($second_offset - $first_offset)*8*2 - vld1.s16 {q13}, [r0] - ; (used) two registers (q14, q13) - MEND - ; -------------------------------------------------------------------------- - ; Load from output (used as temporary storage) - ; reg1 = output[first_offset] - ; reg2 = output[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and loading - add r1, #($first_offset - $prev_offset )*32*2 - vld1.s16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vld1.s16 {$reg2}, [r1] - ; (used) two registers ($reg1, $reg2) - MEND - ; -------------------------------------------------------------------------- - ; Store into output (sometimes as as temporary storage) - ; output[first_offset] = reg1 - ; output[second_offset] = reg2 - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and storing - add r1, #($first_offset - $prev_offset )*32*2 - vst1.16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vst1.16 {$reg2}, [r1] - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10] - vst1.16 {d11}, [r9] - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10]! - vst1.16 {d11}, [r9]! - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6] - vst1.16 {d4}, [r7] - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6]! - vst1.16 {d4}, [r7]! - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - ; TODO(cd): have special case to re-use constants when they are similar for - ; consecutive butterflies - ; TODO(cd): have special case when both constants are the same, do the - ; additions/subtractions before the multiplies. - ; generate the constants - ; generate scalar constants - mov r8, #$first_constant & 0xFF00 - mov r12, #$second_constant & 0xFF00 - add r8, #$first_constant & 0x00FF - add r12, #$second_constant & 0x00FF - ; generate vector constants - vdup.16 d30, r8 - vdup.16 d31, r12 - ; (used) two for inputs (regA-regD), one for constants (q15) - ; do some multiplications (ordered for maximum latency hiding) - vmull.s16 q8, $regC, d30 - vmull.s16 q10, $regA, d31 - vmull.s16 q9, $regD, d30 - vmull.s16 q11, $regB, d31 - vmull.s16 q12, $regC, d31 - ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/subtractions (to get back two register) - vsub.s32 q8, q8, q10 - vsub.s32 q9, q9, q11 - ; do more multiplications (ordered for maximum latency hiding) - vmull.s16 q10, $regD, d31 - vmull.s16 q11, $regA, d30 - vmull.s16 q15, $regB, d30 - ; (used) six for intermediate (q8-q12, q15) - ; do more addition/subtractions - vadd.s32 q11, q12, q11 - vadd.s32 q10, q10, q15 - ; (used) four for intermediate (q8-q11) - ; dct_const_round_shift - vqrshrn.s32 $reg1, q8, #14 - vqrshrn.s32 $reg2, q9, #14 - vqrshrn.s32 $reg3, q11, #14 - vqrshrn.s32 $reg4, q10, #14 - ; (used) two for results, well four d registers - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - MEND - ; -------------------------------------------------------------------------- - -;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); -; -; r0 int16_t *input, -; r1 uint8_t *dest, -; r2 int dest_stride) -; loop counters -; r4 bands loop counter -; r5 pass loop counter -; r8 transpose loop counter -; combine-add pointers -; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) -; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) -; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) -; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) - -|aom_idct32x32_1024_add_neon| PROC - ; This function does one pass of idct32x32 transform. - ; - ; This is done by transposing the input and then doing a 1d transform on - ; columns. In the first pass, the transposed columns are the original - ; rows. In the second pass, after the transposition, the colums are the - ; original columns. - ; The 1d transform is done by looping over bands of eight columns (the - ; idct32_bands loop). For each band, the transform input transposition - ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are transposed by pairs (the idct32_transpose_pair loop). - push {r4-r11} - vpush {d8-d15} - ; stack operation - ; internal buffer used to transpose 8 lines into before transforming them - ; int16_t transpose_buffer[32 * 8]; - ; at sp + [4096, 4607] - ; results of the first pass (transpose and transform rows) - ; int16_t pass1[32 * 32]; - ; at sp + [0, 2047] - ; results of the second pass (transpose and transform columns) - ; int16_t pass2[32 * 32]; - ; at sp + [2048, 4095] - sub sp, sp, #512+2048+2048 - - ; r6 = dest + 31 * dest_stride - ; r7 = dest + 0 * dest_stride - ; r9 = dest + 15 * dest_stride - ; r10 = dest + 16 * dest_stride - rsb r6, r2, r2, lsl #5 - rsb r9, r2, r2, lsl #4 - add r10, r1, r2, lsl #4 - mov r7, r1 - add r6, r6, r1 - add r9, r9, r1 - ; r11 = -dest_stride - neg r11, r2 - ; r3 = input - mov r3, r0 - ; parameters for first pass - ; r0 = transpose_buffer[32 * 8] - add r0, sp, #4096 - ; r1 = pass1[32 * 32] - mov r1, sp - - mov r5, #0 ; initialize pass loop counter -idct32_pass_loop - mov r4, #4 ; initialize bands loop counter -idct32_bands_loop - mov r8, #2 ; initialize transpose loop counter -idct32_transpose_pair_loop - ; Load two horizontally consecutive 8x8 16bit data matrices. The first one - ; into q0-q7 and the second one into q8-q15. There is a stride of 64, - ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r3]! - vld1.s16 {q0}, [r3]! - add r3, #32 - vld1.s16 {q9}, [r3]! - vld1.s16 {q1}, [r3]! - add r3, #32 - vld1.s16 {q10}, [r3]! - vld1.s16 {q2}, [r3]! - add r3, #32 - vld1.s16 {q11}, [r3]! - vld1.s16 {q3}, [r3]! - add r3, #32 - vld1.s16 {q12}, [r3]! - vld1.s16 {q4}, [r3]! - add r3, #32 - vld1.s16 {q13}, [r3]! - vld1.s16 {q5}, [r3]! - add r3, #32 - vld1.s16 {q14}, [r3]! - vld1.s16 {q6}, [r3]! - add r3, #32 - vld1.s16 {q15}, [r3]! - vld1.s16 {q7}, [r3]! - - ; Transpose the two 8x8 16bit data matrices. - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vswp d1, d8 - vswp d7, d14 - vswp d5, d12 - vswp d3, d10 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; Store both matrices after each other. There is a stride of 32, which - ; adjusts to nothing because of the post-increments. - vst1.16 {q8}, [r0]! - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! - vst1.16 {q0}, [r0]! - vst1.16 {q1}, [r0]! - vst1.16 {q2}, [r0]! - vst1.16 {q3}, [r0]! - vst1.16 {q4}, [r0]! - vst1.16 {q5}, [r0]! - vst1.16 {q6}, [r0]! - vst1.16 {q7}, [r0]! - - ; increment pointers by adjusted stride (not necessary for r0/out) - ; go back by 7*32 for the seven lines moved fully by read and add - ; go back by 32 for the eigth line only read - ; advance by 16*2 to go the next pair - sub r3, r3, #7*32*2 + 32 - 16*2 - ; transpose pair loop processing - subs r8, r8, #1 - bne idct32_transpose_pair_loop - - ; restore r0/input to its original value - sub r0, r0, #32*8*2 - - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results. To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-10,11-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - - ; -------------------------------------------------------------------------- - ; BLOCK A: 16-19,28-31 - ; -------------------------------------------------------------------------- - ; generate 16,17,30,31 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; - ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; - ;step1b[16][i] = dct_const_round_shift(temp1); - ;step1b[31][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 0, 1, 31 - DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; - ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; - ;step1b[17][i] = dct_const_round_shift(temp1); - ;step1b[30][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 31, 17, 15 - DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[16] = step1b[16][i] + step1b[17][i]; - ;step2[17] = step1b[16][i] - step1b[17][i]; - ;step2[30] = -step1b[30][i] + step1b[31][i]; - ;step2[31] = step1b[30][i] + step1b[31][i]; - vadd.s16 q4, q0, q1 - vsub.s16 q13, q0, q1 - vadd.s16 q6, q2, q3 - vsub.s16 q14, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; - ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; - ;step3[17] = dct_const_round_shift(temp1); - ;step3[30] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; generate 18,19,28,29 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; - ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; - ;step1b[18][i] = dct_const_round_shift(temp1); - ;step1b[29][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 15, 9, 23 - DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; - ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; - ;step1b[19][i] = dct_const_round_shift(temp1); - ;step1b[28][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 23, 25, 7 - DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[18] = -step1b[18][i] + step1b[19][i]; - ;step2[19] = step1b[18][i] + step1b[19][i]; - ;step2[28] = step1b[28][i] + step1b[29][i]; - ;step2[29] = step1b[28][i] - step1b[29][i]; - vsub.s16 q13, q3, q2 - vadd.s16 q3, q3, q2 - vsub.s16 q14, q1, q0 - vadd.s16 q2, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); - ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); - ;step3[29] = dct_const_round_shift(temp1); - ;step3[18] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 - ; -------------------------------------------------------------------------- - ; combine 16-19,28-31 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[16] = step1b[16][i] + step1b[19][i]; - ;step1[17] = step1b[17][i] + step1b[18][i]; - ;step1[18] = step1b[17][i] - step1b[18][i]; - ;step1[29] = step1b[30][i] - step1b[29][i]; - ;step1[30] = step1b[30][i] + step1b[29][i]; - ;step1[31] = step1b[31][i] + step1b[28][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q0 - vadd.s16 q10, q7, q1 - vadd.s16 q15, q6, q3 - vsub.s16 q13, q5, q0 - vsub.s16 q14, q7, q1 - STORE_IN_OUTPUT 0, 16, 31, q8, q15 - STORE_IN_OUTPUT 31, 17, 30, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; - ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; - ;step2[18] = dct_const_round_shift(temp1); - ;step2[29] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 - STORE_IN_OUTPUT 30, 29, 18, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[19] = step1b[16][i] - step1b[19][i]; - ;step1[28] = step1b[31][i] - step1b[28][i]; - vsub.s16 q13, q4, q2 - vsub.s16 q14, q6, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; - ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; - ;step2[19] = dct_const_round_shift(temp1); - ;step2[28] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 - STORE_IN_OUTPUT 18, 19, 28, q4, q6 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK B: 20-23,24-27 - ; -------------------------------------------------------------------------- - ; generate 20,21,26,27 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; - ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; - ;step1b[20][i] = dct_const_round_shift(temp1); - ;step1b[27][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 7, 5, 27 - DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; - ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; - ;step1b[21][i] = dct_const_round_shift(temp1); - ;step1b[26][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 27, 21, 11 - DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[20] = step1b[20][i] + step1b[21][i]; - ;step2[21] = step1b[20][i] - step1b[21][i]; - ;step2[26] = -step1b[26][i] + step1b[27][i]; - ;step2[27] = step1b[26][i] + step1b[27][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; - ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; - ;step3[21] = dct_const_round_shift(temp1); - ;step3[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 22,23,24,25 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; - ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; - ;step1b[22][i] = dct_const_round_shift(temp1); - ;step1b[25][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 11, 13, 19 - DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; - ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; - ;step1b[23][i] = dct_const_round_shift(temp1); - ;step1b[24][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 19, 29, 3 - DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[22] = -step1b[22][i] + step1b[23][i]; - ;step2[23] = step1b[22][i] + step1b[23][i]; - ;step2[24] = step1b[24][i] + step1b[25][i]; - ;step2[25] = step1b[24][i] - step1b[25][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); - ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); - ;step3[25] = dct_const_round_shift(temp1); - ;step3[22] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 20-23,24-27 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[22] = step1b[22][i] + step1b[21][i]; - ;step1[23] = step1b[23][i] + step1b[20][i]; - vadd.s16 q10, q7, q1 - vadd.s16 q11, q5, q0 - ;step1[24] = step1b[24][i] + step1b[27][i]; - ;step1[25] = step1b[25][i] + step1b[26][i]; - vadd.s16 q12, q6, q2 - vadd.s16 q15, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[16] = step1b[16][i] + step1b[23][i]; - ;step3[17] = step1b[17][i] + step1b[22][i]; - ;step3[22] = step1b[17][i] - step1b[22][i]; - ;step3[23] = step1b[16][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 - vadd.s16 q8, q14, q11 - vadd.s16 q9, q13, q10 - vsub.s16 q13, q13, q10 - vsub.s16 q11, q14, q11 - STORE_IN_OUTPUT 17, 17, 16, q9, q8 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[24] = step1b[31][i] - step1b[24][i]; - ;step3[25] = step1b[30][i] - step1b[25][i]; - ;step3[30] = step1b[30][i] + step1b[25][i]; - ;step3[31] = step1b[31][i] + step1b[24][i]; - LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 - vsub.s16 q8, q9, q12 - vadd.s16 q10, q14, q15 - vsub.s16 q14, q14, q15 - vadd.s16 q12, q9, q12 - STORE_IN_OUTPUT 31, 30, 31, q10, q12 - ; -------------------------------------------------------------------------- - ; TODO(cd) do some register allocation change to remove these push/pop - vpush {q8} ; [24] - vpush {q11} ; [23] - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; - ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; - ;step1[22] = dct_const_round_shift(temp1); - ;step1[25] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 31, 25, 22, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; - ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; - ;step1[23] = dct_const_round_shift(temp1); - ;step1[24] = dct_const_round_shift(temp2); - ; TODO(cd) do some register allocation change to remove these push/pop - vpop {q13} ; [23] - vpop {q14} ; [24] - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 22, 24, 23, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[20] = step1b[23][i] - step1b[20][i]; - ;step1[27] = step1b[24][i] - step1b[27][i]; - vsub.s16 q14, q5, q0 - vsub.s16 q13, q6, q2 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); - ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); - ;step2[27] = dct_const_round_shift(temp1); - ;step2[20] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[21] = step1b[22][i] - step1b[21][i]; - ;step1[26] = step1b[25][i] - step1b[26][i]; - vsub.s16 q14, q7, q1 - vsub.s16 q13, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); - ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); - ;step2[26] = dct_const_round_shift(temp1); - ;step2[21] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[18] = step1b[18][i] + step1b[21][i]; - ;step3[19] = step1b[19][i] + step1b[20][i]; - ;step3[20] = step1b[19][i] - step1b[20][i]; - ;step3[21] = step1b[18][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 - vadd.s16 q8, q14, q1 - vadd.s16 q9, q13, q6 - vsub.s16 q13, q13, q6 - vsub.s16 q1, q14, q1 - STORE_IN_OUTPUT 19, 18, 19, q8, q9 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[27] = step1b[28][i] - step1b[27][i]; - ;step3[28] = step1b[28][i] + step1b[27][i]; - ;step3[29] = step1b[29][i] + step1b[26][i]; - ;step3[26] = step1b[29][i] - step1b[26][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 - vsub.s16 q14, q8, q5 - vadd.s16 q10, q8, q5 - vadd.s16 q11, q9, q0 - vsub.s16 q0, q9, q0 - STORE_IN_OUTPUT 29, 28, 29, q10, q11 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; - ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; - ;step1[20] = dct_const_round_shift(temp1); - ;step1[27] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 29, 20, 27, q13, q14 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; - ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; - ;step1[21] = dct_const_round_shift(temp1); - ;step1[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 - STORE_IN_OUTPUT 27, 21, 26, q1, q0 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK C: 8-10,11-15 - ; -------------------------------------------------------------------------- - ; generate 8,9,14,15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; - ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; - ;step2[8] = dct_const_round_shift(temp1); - ;step2[15] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 3, 2, 30 - DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; - ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; - ;step2[9] = dct_const_round_shift(temp1); - ;step2[14] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 30, 18, 14 - DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[8] = step1b[8][i] + step1b[9][i]; - ;step3[9] = step1b[8][i] - step1b[9][i]; - ;step3[14] = step1b[15][i] - step1b[14][i]; - ;step3[15] = step1b[15][i] + step1b[14][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; - ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; - ;step1[9] = dct_const_round_shift(temp1); - ;step1[14] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 10,11,12,13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; - ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; - ;step2[10] = dct_const_round_shift(temp1); - ;step2[13] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 14, 10, 22 - DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; - ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; - ;step2[11] = dct_const_round_shift(temp1); - ;step2[12] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 22, 26, 6 - DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[10] = step1b[11][i] - step1b[10][i]; - ;step3[11] = step1b[11][i] + step1b[10][i]; - ;step3[12] = step1b[12][i] + step1b[13][i]; - ;step3[13] = step1b[12][i] - step1b[13][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); - ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); - ;step1[13] = dct_const_round_shift(temp1); - ;step1[10] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 8-10,11-15 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[8] = step1b[8][i] + step1b[11][i]; - ;step2[9] = step1b[9][i] + step1b[10][i]; - ;step2[10] = step1b[9][i] - step1b[10][i]; - vadd.s16 q8, q0, q5 - vadd.s16 q9, q1, q7 - vsub.s16 q13, q1, q7 - ;step2[13] = step1b[14][i] - step1b[13][i]; - ;step2[14] = step1b[14][i] + step1b[13][i]; - ;step2[15] = step1b[15][i] + step1b[12][i]; - vsub.s16 q14, q3, q4 - vadd.s16 q10, q3, q4 - vadd.s16 q15, q2, q6 - STORE_IN_OUTPUT 26, 8, 15, q8, q15 - STORE_IN_OUTPUT 15, 9, 14, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; - ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; - ;step3[10] = dct_const_round_shift(temp1); - ;step3[13] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 14, 13, 10, q3, q1 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[11] = step1b[8][i] - step1b[11][i]; - ;step2[12] = step1b[15][i] - step1b[12][i]; - vsub.s16 q13, q0, q5 - vsub.s16 q14, q2, q6 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; - ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; - ;step3[11] = dct_const_round_shift(temp1); - ;step3[12] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 10, 11, 12, q1, q3 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK D: 0-3,4-7 - ; -------------------------------------------------------------------------- - ; generate 4,5,6,7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; - ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; - ;step3[4] = dct_const_round_shift(temp1); - ;step3[7] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 6, 4, 28 - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; - ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; - ;step3[5] = dct_const_round_shift(temp1); - ;step3[6] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 28, 20, 12 - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[4] = step1b[4][i] + step1b[5][i]; - ;step1[5] = step1b[4][i] - step1b[5][i]; - ;step1[6] = step1b[7][i] - step1b[6][i]; - ;step1[7] = step1b[7][i] + step1b[6][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; - ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; - ;step2[5] = dct_const_round_shift(temp1); - ;step2[6] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 0,1,2,3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; - ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; - ;step1[1] = dct_const_round_shift(temp1); - ;step1[0] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 12, 0, 16 - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; - ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; - ;step1[2] = dct_const_round_shift(temp1); - ;step1[3] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 16, 8, 24 - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[0] = step1b[0][i] + step1b[3][i]; - ;step2[1] = step1b[1][i] + step1b[2][i]; - ;step2[2] = step1b[1][i] - step1b[2][i]; - ;step2[3] = step1b[0][i] - step1b[3][i]; - vadd.s16 q4, q7, q6 - vsub.s16 q7, q7, q6 - vsub.s16 q6, q5, q14 - vadd.s16 q5, q5, q14 - ; -------------------------------------------------------------------------- - ; combine 0-3,4-7 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[0] = step1b[0][i] + step1b[7][i]; - ;step3[1] = step1b[1][i] + step1b[6][i]; - ;step3[2] = step1b[2][i] + step1b[5][i]; - ;step3[3] = step1b[3][i] + step1b[4][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q3 - vadd.s16 q10, q6, q1 - vadd.s16 q11, q7, q0 - ;step3[4] = step1b[3][i] - step1b[4][i]; - ;step3[5] = step1b[2][i] - step1b[5][i]; - ;step3[6] = step1b[1][i] - step1b[6][i]; - ;step3[7] = step1b[0][i] - step1b[7][i]; - vsub.s16 q12, q7, q0 - vsub.s16 q13, q6, q1 - vsub.s16 q14, q5, q3 - vsub.s16 q15, q4, q2 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[0] = step1b[0][i] + step1b[15][i]; - ;step1[1] = step1b[1][i] + step1b[14][i]; - ;step1[14] = step1b[1][i] - step1b[14][i]; - ;step1[15] = step1b[0][i] - step1b[15][i]; - LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 - vadd.s16 q2, q8, q1 - vadd.s16 q3, q9, q0 - vsub.s16 q4, q9, q0 - vsub.s16 q5, q8, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[14 * 32] = step1b[14][i] + step1b[17][i]; - ;output[15 * 32] = step1b[15][i] + step1b[16][i]; - ;output[16 * 32] = step1b[15][i] - step1b[16][i]; - ;output[17 * 32] = step1b[14][i] - step1b[17][i]; - LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - - cmp r5, #0 - bgt idct32_bands_end_2nd_pass - -idct32_bands_end_1st_pass - STORE_IN_OUTPUT 17, 16, 17, q6, q7 - STORE_IN_OUTPUT 17, 14, 15, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 30, 31, q6, q7 - STORE_IN_OUTPUT 31, 0, 1, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 19, 18, 19, q6, q7 - STORE_IN_OUTPUT 19, 12, 13, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 28, 29, q6, q7 - STORE_IN_OUTPUT 29, 2, 3, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 21, 20, 21, q6, q7 - STORE_IN_OUTPUT 21, 10, 11, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 26, 27, q6, q7 - STORE_IN_OUTPUT 27, 4, 5, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 23, 22, 23, q6, q7 - STORE_IN_OUTPUT 23, 8, 9, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 24, 25, q6, q7 - STORE_IN_OUTPUT 25, 6, 7, q4, q5 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #7*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; parameters for second pass - ; the input of pass2 is the result of pass1. we have to remove the offset - ; of 32 columns induced by the above idct32_bands_loop - sub r3, r1, #32*2 - ; r1 = pass2[32 * 32] - add r1, sp, #2048 - - ; pass loop processing - add r5, r5, #1 - b idct32_pass_loop - -idct32_bands_end_2nd_pass - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; restore pointers to their initial indices for next band pass by - ; removing/adding dest_stride * 8. The actual increment by eight - ; is taken care of within the _LAST macros. - add r6, r6, r2, lsl #3 - add r9, r9, r2, lsl #3 - sub r7, r7, r2, lsl #3 - sub r10, r10, r2, lsl #3 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #25*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; stack operation - add sp, sp, #512+2048+2048 - vpop {d8-d15} - pop {r4-r11} - bx lr - ENDP ; |aom_idct32x32_1024_add_neon| - END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm new file mode 100644 index 000000000..e7793fb16 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm @@ -0,0 +1,1302 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +;TODO(cd): adjust these constant to be able to use vqdmulh for faster +; dct_const_round_shift(a * b) within butterfly calculations. +cospi_1_64 EQU 16364 +cospi_2_64 EQU 16305 +cospi_3_64 EQU 16207 +cospi_4_64 EQU 16069 +cospi_5_64 EQU 15893 +cospi_6_64 EQU 15679 +cospi_7_64 EQU 15426 +cospi_8_64 EQU 15137 +cospi_9_64 EQU 14811 +cospi_10_64 EQU 14449 +cospi_11_64 EQU 14053 +cospi_12_64 EQU 13623 +cospi_13_64 EQU 13160 +cospi_14_64 EQU 12665 +cospi_15_64 EQU 12140 +cospi_16_64 EQU 11585 +cospi_17_64 EQU 11003 +cospi_18_64 EQU 10394 +cospi_19_64 EQU 9760 +cospi_20_64 EQU 9102 +cospi_21_64 EQU 8423 +cospi_22_64 EQU 7723 +cospi_23_64 EQU 7005 +cospi_24_64 EQU 6270 +cospi_25_64 EQU 5520 +cospi_26_64 EQU 4756 +cospi_27_64 EQU 3981 +cospi_28_64 EQU 3196 +cospi_29_64 EQU 2404 +cospi_30_64 EQU 1606 +cospi_31_64 EQU 804 + + + EXPORT |aom_idct32x32_1024_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY + + ; -------------------------------------------------------------------------- + ; Load from transposed_buffer + ; q13 = transposed_buffer[first_offset] + ; q14 = transposed_buffer[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; transposed_buffer must be passed in. use 0 for first use. + MACRO + LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset + ; address calculation with proper stride and loading + add r0, #($first_offset - $prev_offset )*8*2 + vld1.s16 {q14}, [r0] + add r0, #($second_offset - $first_offset)*8*2 + vld1.s16 {q13}, [r0] + ; (used) two registers (q14, q13) + MEND + ; -------------------------------------------------------------------------- + ; Load from output (used as temporary storage) + ; reg1 = output[first_offset] + ; reg2 = output[second_offset] + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and loading + add r1, #($first_offset - $prev_offset )*32*2 + vld1.s16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vld1.s16 {$reg2}, [r1] + ; (used) two registers ($reg1, $reg2) + MEND + ; -------------------------------------------------------------------------- + ; Store into output (sometimes as as temporary storage) + ; output[first_offset] = reg1 + ; output[second_offset] = reg2 + ; for proper address calculation, the last offset used when manipulating + ; output, whether reading or storing) must be passed in. use 0 for first + ; use. + MACRO + STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 + ; address calculation with proper stride and storing + add r1, #($first_offset - $prev_offset )*32*2 + vst1.16 {$reg1}, [r1] + add r1, #($second_offset - $first_offset)*32*2 + vst1.16 {$reg2}, [r1] + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10] + vst1.16 {d11}, [r9] + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q6-q9 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_CENTER_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d8}, [r10], r2 + vld1.s16 {d11}, [r9], r11 + vld1.s16 {d9}, [r10] + vld1.s16 {d10}, [r9] + ; ROUND_POWER_OF_TWO + vrshr.s16 q7, q7, #6 + vrshr.s16 q8, q8, #6 + vrshr.s16 q9, q9, #6 + vrshr.s16 q6, q6, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q7, q7, d9 + vaddw.u8 q8, q8, d10 + vaddw.u8 q9, q9, d11 + vaddw.u8 q6, q6, d8 + ; clip pixel + vqmovun.s16 d9, q7 + vqmovun.s16 d10, q8 + vqmovun.s16 d11, q9 + vqmovun.s16 d8, q6 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d9}, [r10], r11 + vst1.16 {d10}, [r9], r2 + vst1.16 {d8}, [r10]! + vst1.16 {d11}, [r9]! + ; update pointers (by dest_stride * 2) + sub r9, r9, r2, lsl #1 + add r10, r10, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6] + vst1.16 {d4}, [r7] + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Combine-add results with current destination content + ; q4-q7 contain the results (out[j * 32 + 0-31]) + MACRO + STORE_COMBINE_EXTREME_RESULTS_LAST + ; load dest[j * dest_stride + 0-31] + vld1.s16 {d4}, [r7], r2 + vld1.s16 {d7}, [r6], r11 + vld1.s16 {d5}, [r7] + vld1.s16 {d6}, [r6] + ; ROUND_POWER_OF_TWO + vrshr.s16 q5, q5, #6 + vrshr.s16 q6, q6, #6 + vrshr.s16 q7, q7, #6 + vrshr.s16 q4, q4, #6 + ; add to dest[j * dest_stride + 0-31] + vaddw.u8 q5, q5, d5 + vaddw.u8 q6, q6, d6 + vaddw.u8 q7, q7, d7 + vaddw.u8 q4, q4, d4 + ; clip pixel + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + vqmovun.s16 d4, q4 + ; store back into dest[j * dest_stride + 0-31] + vst1.16 {d5}, [r7], r11 + vst1.16 {d6}, [r6], r2 + vst1.16 {d7}, [r6]! + vst1.16 {d4}, [r7]! + ; update pointers (by dest_stride * 2) + sub r6, r6, r2, lsl #1 + add r7, r7, r2, lsl #1 + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + ; TODO(cd): have special case to re-use constants when they are similar for + ; consecutive butterflies + ; TODO(cd): have special case when both constants are the same, do the + ; additions/subtractions before the multiplies. + ; generate the constants + ; generate scalar constants + mov r8, #$first_constant & 0xFF00 + mov r12, #$second_constant & 0xFF00 + add r8, #$first_constant & 0x00FF + add r12, #$second_constant & 0x00FF + ; generate vector constants + vdup.16 d30, r8 + vdup.16 d31, r12 + ; (used) two for inputs (regA-regD), one for constants (q15) + ; do some multiplications (ordered for maximum latency hiding) + vmull.s16 q8, $regC, d30 + vmull.s16 q10, $regA, d31 + vmull.s16 q9, $regD, d30 + vmull.s16 q11, $regB, d31 + vmull.s16 q12, $regC, d31 + ; (used) five for intermediate (q8-q12), one for constants (q15) + ; do some addition/subtractions (to get back two register) + vsub.s32 q8, q8, q10 + vsub.s32 q9, q9, q11 + ; do more multiplications (ordered for maximum latency hiding) + vmull.s16 q10, $regD, d31 + vmull.s16 q11, $regA, d30 + vmull.s16 q15, $regB, d30 + ; (used) six for intermediate (q8-q12, q15) + ; do more addition/subtractions + vadd.s32 q11, q12, q11 + vadd.s32 q10, q10, q15 + ; (used) four for intermediate (q8-q11) + ; dct_const_round_shift + vqrshrn.s32 $reg1, q8, #14 + vqrshrn.s32 $reg2, q9, #14 + vqrshrn.s32 $reg3, q11, #14 + vqrshrn.s32 $reg4, q10, #14 + ; (used) two for results, well four d registers + MEND + ; -------------------------------------------------------------------------- + ; Touches q8-q12, q15 (q13-q14 are preserved) + ; valid output registers are anything but q8-q11 + MACRO + DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 + MEND + ; -------------------------------------------------------------------------- + +;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); +; +; r0 int16_t *input, +; r1 uint8_t *dest, +; r2 int dest_stride) +; loop counters +; r4 bands loop counter +; r5 pass loop counter +; r8 transpose loop counter +; combine-add pointers +; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) +; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) +; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) +; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) + +|aom_idct32x32_1024_add_neon| PROC + ; This function does one pass of idct32x32 transform. + ; + ; This is done by transposing the input and then doing a 1d transform on + ; columns. In the first pass, the transposed columns are the original + ; rows. In the second pass, after the transposition, the colums are the + ; original columns. + ; The 1d transform is done by looping over bands of eight columns (the + ; idct32_bands loop). For each band, the transform input transposition + ; is done on demand, one band of four 8x8 matrices at a time. The four + ; matrices are transposed by pairs (the idct32_transpose_pair loop). + push {r4-r11} + vpush {d8-d15} + ; stack operation + ; internal buffer used to transpose 8 lines into before transforming them + ; int16_t transpose_buffer[32 * 8]; + ; at sp + [4096, 4607] + ; results of the first pass (transpose and transform rows) + ; int16_t pass1[32 * 32]; + ; at sp + [0, 2047] + ; results of the second pass (transpose and transform columns) + ; int16_t pass2[32 * 32]; + ; at sp + [2048, 4095] + sub sp, sp, #512+2048+2048 + + ; r6 = dest + 31 * dest_stride + ; r7 = dest + 0 * dest_stride + ; r9 = dest + 15 * dest_stride + ; r10 = dest + 16 * dest_stride + rsb r6, r2, r2, lsl #5 + rsb r9, r2, r2, lsl #4 + add r10, r1, r2, lsl #4 + mov r7, r1 + add r6, r6, r1 + add r9, r9, r1 + ; r11 = -dest_stride + neg r11, r2 + ; r3 = input + mov r3, r0 + ; parameters for first pass + ; r0 = transpose_buffer[32 * 8] + add r0, sp, #4096 + ; r1 = pass1[32 * 32] + mov r1, sp + + mov r5, #0 ; initialize pass loop counter +idct32_pass_loop + mov r4, #4 ; initialize bands loop counter +idct32_bands_loop + mov r8, #2 ; initialize transpose loop counter +idct32_transpose_pair_loop + ; Load two horizontally consecutive 8x8 16bit data matrices. The first one + ; into q0-q7 and the second one into q8-q15. There is a stride of 64, + ; adjusted to 32 because of the two post-increments. + vld1.s16 {q8}, [r3]! + vld1.s16 {q0}, [r3]! + add r3, #32 + vld1.s16 {q9}, [r3]! + vld1.s16 {q1}, [r3]! + add r3, #32 + vld1.s16 {q10}, [r3]! + vld1.s16 {q2}, [r3]! + add r3, #32 + vld1.s16 {q11}, [r3]! + vld1.s16 {q3}, [r3]! + add r3, #32 + vld1.s16 {q12}, [r3]! + vld1.s16 {q4}, [r3]! + add r3, #32 + vld1.s16 {q13}, [r3]! + vld1.s16 {q5}, [r3]! + add r3, #32 + vld1.s16 {q14}, [r3]! + vld1.s16 {q6}, [r3]! + add r3, #32 + vld1.s16 {q15}, [r3]! + vld1.s16 {q7}, [r3]! + + ; Transpose the two 8x8 16bit data matrices. + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vswp d1, d8 + vswp d7, d14 + vswp d5, d12 + vswp d3, d10 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + ; Store both matrices after each other. There is a stride of 32, which + ; adjusts to nothing because of the post-increments. + vst1.16 {q8}, [r0]! + vst1.16 {q9}, [r0]! + vst1.16 {q10}, [r0]! + vst1.16 {q11}, [r0]! + vst1.16 {q12}, [r0]! + vst1.16 {q13}, [r0]! + vst1.16 {q14}, [r0]! + vst1.16 {q15}, [r0]! + vst1.16 {q0}, [r0]! + vst1.16 {q1}, [r0]! + vst1.16 {q2}, [r0]! + vst1.16 {q3}, [r0]! + vst1.16 {q4}, [r0]! + vst1.16 {q5}, [r0]! + vst1.16 {q6}, [r0]! + vst1.16 {q7}, [r0]! + + ; increment pointers by adjusted stride (not necessary for r0/out) + ; go back by 7*32 for the seven lines moved fully by read and add + ; go back by 32 for the eigth line only read + ; advance by 16*2 to go the next pair + sub r3, r3, #7*32*2 + 32 - 16*2 + ; transpose pair loop processing + subs r8, r8, #1 + bne idct32_transpose_pair_loop + + ; restore r0/input to its original value + sub r0, r0, #32*8*2 + + ; Instead of doing the transforms stage by stage, it is done by loading + ; some input values and doing as many stages as possible to minimize the + ; storing/loading of intermediate results. To fit within registers, the + ; final coefficients are cut into four blocks: + ; BLOCK A: 16-19,28-31 + ; BLOCK B: 20-23,24-27 + ; BLOCK C: 8-10,11-15 + ; BLOCK D: 0-3,4-7 + ; Blocks A and C are straight calculation through the various stages. In + ; block B, further calculations are performed using the results from + ; block A. In block D, further calculations are performed using the results + ; from block C and then the final calculations are done using results from + ; block A and B which have been combined at the end of block B. + + ; -------------------------------------------------------------------------- + ; BLOCK A: 16-19,28-31 + ; -------------------------------------------------------------------------- + ; generate 16,17,30,31 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; + ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; + ;step1b[16][i] = dct_const_round_shift(temp1); + ;step1b[31][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 0, 1, 31 + DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; + ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; + ;step1b[17][i] = dct_const_round_shift(temp1); + ;step1b[30][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 31, 17, 15 + DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[16] = step1b[16][i] + step1b[17][i]; + ;step2[17] = step1b[16][i] - step1b[17][i]; + ;step2[30] = -step1b[30][i] + step1b[31][i]; + ;step2[31] = step1b[30][i] + step1b[31][i]; + vadd.s16 q4, q0, q1 + vsub.s16 q13, q0, q1 + vadd.s16 q6, q2, q3 + vsub.s16 q14, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; + ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; + ;step3[17] = dct_const_round_shift(temp1); + ;step3[30] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; generate 18,19,28,29 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; + ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; + ;step1b[18][i] = dct_const_round_shift(temp1); + ;step1b[29][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 15, 9, 23 + DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; + ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; + ;step1b[19][i] = dct_const_round_shift(temp1); + ;step1b[28][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 23, 25, 7 + DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[18] = -step1b[18][i] + step1b[19][i]; + ;step2[19] = step1b[18][i] + step1b[19][i]; + ;step2[28] = step1b[28][i] + step1b[29][i]; + ;step2[29] = step1b[28][i] - step1b[29][i]; + vsub.s16 q13, q3, q2 + vadd.s16 q3, q3, q2 + vsub.s16 q14, q1, q0 + vadd.s16 q2, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); + ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); + ;step3[29] = dct_const_round_shift(temp1); + ;step3[18] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 + ; -------------------------------------------------------------------------- + ; combine 16-19,28-31 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[16] = step1b[16][i] + step1b[19][i]; + ;step1[17] = step1b[17][i] + step1b[18][i]; + ;step1[18] = step1b[17][i] - step1b[18][i]; + ;step1[29] = step1b[30][i] - step1b[29][i]; + ;step1[30] = step1b[30][i] + step1b[29][i]; + ;step1[31] = step1b[31][i] + step1b[28][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q0 + vadd.s16 q10, q7, q1 + vadd.s16 q15, q6, q3 + vsub.s16 q13, q5, q0 + vsub.s16 q14, q7, q1 + STORE_IN_OUTPUT 0, 16, 31, q8, q15 + STORE_IN_OUTPUT 31, 17, 30, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; + ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; + ;step2[18] = dct_const_round_shift(temp1); + ;step2[29] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 + STORE_IN_OUTPUT 30, 29, 18, q1, q0 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[19] = step1b[16][i] - step1b[19][i]; + ;step1[28] = step1b[31][i] - step1b[28][i]; + vsub.s16 q13, q4, q2 + vsub.s16 q14, q6, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; + ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; + ;step2[19] = dct_const_round_shift(temp1); + ;step2[28] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 + STORE_IN_OUTPUT 18, 19, 28, q4, q6 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK B: 20-23,24-27 + ; -------------------------------------------------------------------------- + ; generate 20,21,26,27 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; + ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; + ;step1b[20][i] = dct_const_round_shift(temp1); + ;step1b[27][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 7, 5, 27 + DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; + ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; + ;step1b[21][i] = dct_const_round_shift(temp1); + ;step1b[26][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 27, 21, 11 + DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[20] = step1b[20][i] + step1b[21][i]; + ;step2[21] = step1b[20][i] - step1b[21][i]; + ;step2[26] = -step1b[26][i] + step1b[27][i]; + ;step2[27] = step1b[26][i] + step1b[27][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; + ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; + ;step3[21] = dct_const_round_shift(temp1); + ;step3[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 22,23,24,25 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; + ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; + ;step1b[22][i] = dct_const_round_shift(temp1); + ;step1b[25][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 11, 13, 19 + DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 1 + ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; + ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; + ;step1b[23][i] = dct_const_round_shift(temp1); + ;step1b[24][i] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 19, 29, 3 + DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;step2[22] = -step1b[22][i] + step1b[23][i]; + ;step2[23] = step1b[22][i] + step1b[23][i]; + ;step2[24] = step1b[24][i] + step1b[25][i]; + ;step2[25] = step1b[24][i] - step1b[25][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); + ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); + ;step3[25] = dct_const_round_shift(temp1); + ;step3[22] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 20-23,24-27 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[22] = step1b[22][i] + step1b[21][i]; + ;step1[23] = step1b[23][i] + step1b[20][i]; + vadd.s16 q10, q7, q1 + vadd.s16 q11, q5, q0 + ;step1[24] = step1b[24][i] + step1b[27][i]; + ;step1[25] = step1b[25][i] + step1b[26][i]; + vadd.s16 q12, q6, q2 + vadd.s16 q15, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[16] = step1b[16][i] + step1b[23][i]; + ;step3[17] = step1b[17][i] + step1b[22][i]; + ;step3[22] = step1b[17][i] - step1b[22][i]; + ;step3[23] = step1b[16][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 + vadd.s16 q8, q14, q11 + vadd.s16 q9, q13, q10 + vsub.s16 q13, q13, q10 + vsub.s16 q11, q14, q11 + STORE_IN_OUTPUT 17, 17, 16, q9, q8 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[24] = step1b[31][i] - step1b[24][i]; + ;step3[25] = step1b[30][i] - step1b[25][i]; + ;step3[30] = step1b[30][i] + step1b[25][i]; + ;step3[31] = step1b[31][i] + step1b[24][i]; + LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 + vsub.s16 q8, q9, q12 + vadd.s16 q10, q14, q15 + vsub.s16 q14, q14, q15 + vadd.s16 q12, q9, q12 + STORE_IN_OUTPUT 31, 30, 31, q10, q12 + ; -------------------------------------------------------------------------- + ; TODO(cd) do some register allocation change to remove these push/pop + vpush {q8} ; [24] + vpush {q11} ; [23] + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; + ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; + ;step1[22] = dct_const_round_shift(temp1); + ;step1[25] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 31, 25, 22, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; + ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; + ;step1[23] = dct_const_round_shift(temp1); + ;step1[24] = dct_const_round_shift(temp2); + ; TODO(cd) do some register allocation change to remove these push/pop + vpop {q13} ; [23] + vpop {q14} ; [24] + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 22, 24, 23, q14, q13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[20] = step1b[23][i] - step1b[20][i]; + ;step1[27] = step1b[24][i] - step1b[27][i]; + vsub.s16 q14, q5, q0 + vsub.s16 q13, q6, q2 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); + ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); + ;step2[27] = dct_const_round_shift(temp1); + ;step2[20] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[21] = step1b[22][i] - step1b[21][i]; + ;step1[26] = step1b[25][i] - step1b[26][i]; + vsub.s16 q14, q7, q1 + vsub.s16 q13, q4, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); + ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); + ;step2[26] = dct_const_round_shift(temp1); + ;step2[21] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[18] = step1b[18][i] + step1b[21][i]; + ;step3[19] = step1b[19][i] + step1b[20][i]; + ;step3[20] = step1b[19][i] - step1b[20][i]; + ;step3[21] = step1b[18][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 + vadd.s16 q8, q14, q1 + vadd.s16 q9, q13, q6 + vsub.s16 q13, q13, q6 + vsub.s16 q1, q14, q1 + STORE_IN_OUTPUT 19, 18, 19, q8, q9 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[27] = step1b[28][i] - step1b[27][i]; + ;step3[28] = step1b[28][i] + step1b[27][i]; + ;step3[29] = step1b[29][i] + step1b[26][i]; + ;step3[26] = step1b[29][i] - step1b[26][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 + vsub.s16 q14, q8, q5 + vadd.s16 q10, q8, q5 + vadd.s16 q11, q9, q0 + vsub.s16 q0, q9, q0 + STORE_IN_OUTPUT 29, 28, 29, q10, q11 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; + ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; + ;step1[20] = dct_const_round_shift(temp1); + ;step1[27] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 + STORE_IN_OUTPUT 29, 20, 27, q13, q14 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; + ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; + ;step1[21] = dct_const_round_shift(temp1); + ;step1[26] = dct_const_round_shift(temp2); + DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 + STORE_IN_OUTPUT 27, 21, 26, q1, q0 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK C: 8-10,11-15 + ; -------------------------------------------------------------------------- + ; generate 8,9,14,15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; + ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; + ;step2[8] = dct_const_round_shift(temp1); + ;step2[15] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 3, 2, 30 + DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; + ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; + ;step2[9] = dct_const_round_shift(temp1); + ;step2[14] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 30, 18, 14 + DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[8] = step1b[8][i] + step1b[9][i]; + ;step3[9] = step1b[8][i] - step1b[9][i]; + ;step3[14] = step1b[15][i] - step1b[14][i]; + ;step3[15] = step1b[15][i] + step1b[14][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; + ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; + ;step1[9] = dct_const_round_shift(temp1); + ;step1[14] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 10,11,12,13 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; + ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; + ;step2[10] = dct_const_round_shift(temp1); + ;step2[13] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 14, 10, 22 + DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 2 + ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; + ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; + ;step2[11] = dct_const_round_shift(temp1); + ;step2[12] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 22, 26, 6 + DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;step3[10] = step1b[11][i] - step1b[10][i]; + ;step3[11] = step1b[11][i] + step1b[10][i]; + ;step3[12] = step1b[12][i] + step1b[13][i]; + ;step3[13] = step1b[12][i] - step1b[13][i]; + vsub.s16 q14, q4, q5 + vadd.s16 q5, q4, q5 + vsub.s16 q13, q6, q7 + vadd.s16 q6, q6, q7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); + ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); + ;step1[13] = dct_const_round_shift(temp1); + ;step1[10] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 + ; -------------------------------------------------------------------------- + ; combine 8-10,11-15 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[8] = step1b[8][i] + step1b[11][i]; + ;step2[9] = step1b[9][i] + step1b[10][i]; + ;step2[10] = step1b[9][i] - step1b[10][i]; + vadd.s16 q8, q0, q5 + vadd.s16 q9, q1, q7 + vsub.s16 q13, q1, q7 + ;step2[13] = step1b[14][i] - step1b[13][i]; + ;step2[14] = step1b[14][i] + step1b[13][i]; + ;step2[15] = step1b[15][i] + step1b[12][i]; + vsub.s16 q14, q3, q4 + vadd.s16 q10, q3, q4 + vadd.s16 q15, q2, q6 + STORE_IN_OUTPUT 26, 8, 15, q8, q15 + STORE_IN_OUTPUT 15, 9, 14, q9, q10 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; + ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; + ;step3[10] = dct_const_round_shift(temp1); + ;step3[13] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 14, 13, 10, q3, q1 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[11] = step1b[8][i] - step1b[11][i]; + ;step2[12] = step1b[15][i] - step1b[12][i]; + vsub.s16 q13, q0, q5 + vsub.s16 q14, q2, q6 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; + ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; + ;step3[11] = dct_const_round_shift(temp1); + ;step3[12] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + STORE_IN_OUTPUT 10, 11, 12, q1, q3 + ; -------------------------------------------------------------------------- + + + ; -------------------------------------------------------------------------- + ; BLOCK D: 0-3,4-7 + ; -------------------------------------------------------------------------- + ; generate 4,5,6,7 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; + ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; + ;step3[4] = dct_const_round_shift(temp1); + ;step3[7] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 6, 4, 28 + DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 + ; -------------------------------------------------------------------------- + ; part of stage 3 + ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; + ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; + ;step3[5] = dct_const_round_shift(temp1); + ;step3[6] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 28, 20, 12 + DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;step1[4] = step1b[4][i] + step1b[5][i]; + ;step1[5] = step1b[4][i] - step1b[5][i]; + ;step1[6] = step1b[7][i] - step1b[6][i]; + ;step1[7] = step1b[7][i] + step1b[6][i]; + vsub.s16 q13, q0, q1 + vadd.s16 q0, q0, q1 + vsub.s16 q14, q2, q3 + vadd.s16 q2, q2, q3 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; + ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; + ;step2[5] = dct_const_round_shift(temp1); + ;step2[6] = dct_const_round_shift(temp2); + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 + ; -------------------------------------------------------------------------- + ; generate 0,1,2,3 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; + ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; + ;step1[1] = dct_const_round_shift(temp1); + ;step1[0] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 12, 0, 16 + DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 + ; -------------------------------------------------------------------------- + ; part of stage 4 + ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; + ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; + ;step1[2] = dct_const_round_shift(temp1); + ;step1[3] = dct_const_round_shift(temp2); + LOAD_FROM_TRANSPOSED 16, 8, 24 + DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 + ; -------------------------------------------------------------------------- + ; part of stage 5 + ;step2[0] = step1b[0][i] + step1b[3][i]; + ;step2[1] = step1b[1][i] + step1b[2][i]; + ;step2[2] = step1b[1][i] - step1b[2][i]; + ;step2[3] = step1b[0][i] - step1b[3][i]; + vadd.s16 q4, q7, q6 + vsub.s16 q7, q7, q6 + vsub.s16 q6, q5, q14 + vadd.s16 q5, q5, q14 + ; -------------------------------------------------------------------------- + ; combine 0-3,4-7 + ; -------------------------------------------------------------------------- + ; part of stage 6 + ;step3[0] = step1b[0][i] + step1b[7][i]; + ;step3[1] = step1b[1][i] + step1b[6][i]; + ;step3[2] = step1b[2][i] + step1b[5][i]; + ;step3[3] = step1b[3][i] + step1b[4][i]; + vadd.s16 q8, q4, q2 + vadd.s16 q9, q5, q3 + vadd.s16 q10, q6, q1 + vadd.s16 q11, q7, q0 + ;step3[4] = step1b[3][i] - step1b[4][i]; + ;step3[5] = step1b[2][i] - step1b[5][i]; + ;step3[6] = step1b[1][i] - step1b[6][i]; + ;step3[7] = step1b[0][i] - step1b[7][i]; + vsub.s16 q12, q7, q0 + vsub.s16 q13, q6, q1 + vsub.s16 q14, q5, q3 + vsub.s16 q15, q4, q2 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[0] = step1b[0][i] + step1b[15][i]; + ;step1[1] = step1b[1][i] + step1b[14][i]; + ;step1[14] = step1b[1][i] - step1b[14][i]; + ;step1[15] = step1b[0][i] - step1b[15][i]; + LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 + vadd.s16 q2, q8, q1 + vadd.s16 q3, q9, q0 + vsub.s16 q4, q9, q0 + vsub.s16 q5, q8, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[14 * 32] = step1b[14][i] + step1b[17][i]; + ;output[15 * 32] = step1b[15][i] + step1b[16][i]; + ;output[16 * 32] = step1b[15][i] - step1b[16][i]; + ;output[17 * 32] = step1b[14][i] - step1b[17][i]; + LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + + cmp r5, #0 + bgt idct32_bands_end_2nd_pass + +idct32_bands_end_1st_pass + STORE_IN_OUTPUT 17, 16, 17, q6, q7 + STORE_IN_OUTPUT 17, 14, 15, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 31, 30, 31, q6, q7 + STORE_IN_OUTPUT 31, 0, 1, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 19, 18, 19, q6, q7 + STORE_IN_OUTPUT 19, 12, 13, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 29, 28, 29, q6, q7 + STORE_IN_OUTPUT 29, 2, 3, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 21, 20, 21, q6, q7 + STORE_IN_OUTPUT 21, 10, 11, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 27, 26, 27, q6, q7 + STORE_IN_OUTPUT 27, 4, 5, q4, q5 + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_IN_OUTPUT 23, 22, 23, q6, q7 + STORE_IN_OUTPUT 23, 8, 9, q8, q9 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_IN_OUTPUT 25, 24, 25, q6, q7 + STORE_IN_OUTPUT 25, 6, 7, q4, q5 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #7*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; parameters for second pass + ; the input of pass2 is the result of pass1. we have to remove the offset + ; of 32 columns induced by the above idct32_bands_loop + sub r3, r1, #32*2 + ; r1 = pass2[32 * 32] + add r1, sp, #2048 + + ; pass loop processing + add r5, r5, #1 + b idct32_pass_loop + +idct32_bands_end_2nd_pass + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; + ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; + ;output[30 * 32] = step1b[1][i] - step1b[30][i]; + ;output[31 * 32] = step1b[0][i] - step1b[31][i]; + LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[2] = step1b[2][i] + step1b[13][i]; + ;step1[3] = step1b[3][i] + step1b[12][i]; + ;step1[12] = step1b[3][i] - step1b[12][i]; + ;step1[13] = step1b[2][i] - step1b[13][i]; + LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 + vadd.s16 q2, q10, q1 + vadd.s16 q3, q11, q0 + vsub.s16 q4, q11, q0 + vsub.s16 q5, q10, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[12 * 32] = step1b[12][i] + step1b[19][i]; + ;output[13 * 32] = step1b[13][i] + step1b[18][i]; + ;output[18 * 32] = step1b[13][i] - step1b[18][i]; + ;output[19 * 32] = step1b[12][i] - step1b[19][i]; + LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; + ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; + ;output[28 * 32] = step1b[3][i] - step1b[28][i]; + ;output[29 * 32] = step1b[2][i] - step1b[29][i]; + LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[4] = step1b[4][i] + step1b[11][i]; + ;step1[5] = step1b[5][i] + step1b[10][i]; + ;step1[10] = step1b[5][i] - step1b[10][i]; + ;step1[11] = step1b[4][i] - step1b[11][i]; + LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 + vadd.s16 q2, q12, q1 + vadd.s16 q3, q13, q0 + vsub.s16 q4, q13, q0 + vsub.s16 q5, q12, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[10 * 32] = step1b[10][i] + step1b[21][i]; + ;output[11 * 32] = step1b[11][i] + step1b[20][i]; + ;output[20 * 32] = step1b[11][i] - step1b[20][i]; + ;output[21 * 32] = step1b[10][i] - step1b[21][i]; + LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; + ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; + ;output[26 * 32] = step1b[5][i] - step1b[26][i]; + ;output[27 * 32] = step1b[4][i] - step1b[27][i]; + LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS + ; -------------------------------------------------------------------------- + ; part of stage 7 + ;step1[6] = step1b[6][i] + step1b[9][i]; + ;step1[7] = step1b[7][i] + step1b[8][i]; + ;step1[8] = step1b[7][i] - step1b[8][i]; + ;step1[9] = step1b[6][i] - step1b[9][i]; + LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 + vadd.s16 q2, q14, q1 + vadd.s16 q3, q15, q0 + vsub.s16 q4, q15, q0 + vsub.s16 q5, q14, q1 + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; + ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; + ;output[22 * 32] = step1b[9][i] - step1b[22][i]; + ;output[23 * 32] = step1b[8][i] - step1b[23][i]; + LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 + vadd.s16 q8, q4, q1 + vadd.s16 q9, q5, q0 + vsub.s16 q6, q5, q0 + vsub.s16 q7, q4, q1 + STORE_COMBINE_CENTER_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; part of final stage + ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; + ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; + ;output[24 * 32] = step1b[7][i] - step1b[24][i]; + ;output[25 * 32] = step1b[6][i] - step1b[25][i]; + LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 + vadd.s16 q4, q2, q1 + vadd.s16 q5, q3, q0 + vsub.s16 q6, q3, q0 + vsub.s16 q7, q2, q1 + STORE_COMBINE_EXTREME_RESULTS_LAST + ; -------------------------------------------------------------------------- + ; restore pointers to their initial indices for next band pass by + ; removing/adding dest_stride * 8. The actual increment by eight + ; is taken care of within the _LAST macros. + add r6, r6, r2, lsl #3 + add r9, r9, r2, lsl #3 + sub r7, r7, r2, lsl #3 + sub r10, r10, r2, lsl #3 + + ; restore r0 by removing the last offset from the last + ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 + sub r0, r0, #24*8*2 + ; restore r1 by removing the last offset from the last + ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 + ; advance by 8 columns => 8*2 + sub r1, r1, #25*32*2 - 8*2 + ; advance by 8 lines (8*32*2) + ; go back by the two pairs from the loop (32*2) + add r3, r3, #8*32*2 - 32*2 + + ; bands loop processing + subs r4, r4, #1 + bne idct32_bands_loop + + ; stack operation + add sp, sp, #512+2048+2048 + vpop {d8-d15} + pop {r4-r11} + bx lr + ENDP ; |aom_idct32x32_1024_add_neon| + END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm deleted file mode 100644 index 6bd733d5d..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.asm +++ /dev/null @@ -1,71 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct4x4_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct4x4_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 4) - add r0, r0, #8 ; + (1 <<((4) - 1)) - asr r0, r0, #4 ; >> 4 - - vdup.s16 q0, r0 ; duplicate a1 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q8, q0, d2 ; dest[x] + a1 - vaddw.u8 q9, q0, d4 - - vqmovun.s16 d6, q8 ; clip_pixel - vqmovun.s16 d7, q9 - - vst1.32 {d6[0]}, [r12], r2 - vst1.32 {d6[1]}, [r12], r2 - vst1.32 {d7[0]}, [r12], r2 - vst1.32 {d7[1]}, [r12] - - bx lr - ENDP ; |aom_idct4x4_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm new file mode 100644 index 000000000..6bd733d5d --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm @@ -0,0 +1,71 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + + EXPORT |aom_idct4x4_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct4x4_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 4) + add r0, r0, #8 ; + (1 <<((4) - 1)) + asr r0, r0, #4 ; >> 4 + + vdup.s16 q0, r0 ; duplicate a1 + + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q8, q0, d2 ; dest[x] + a1 + vaddw.u8 q9, q0, d4 + + vqmovun.s16 d6, q8 ; clip_pixel + vqmovun.s16 d7, q9 + + vst1.32 {d6[0]}, [r12], r2 + vst1.32 {d6[1]}, [r12], r2 + vst1.32 {d7[0]}, [r12], r2 + vst1.32 {d7[1]}, [r12] + + bx lr + ENDP ; |aom_idct4x4_1_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm deleted file mode 100644 index 127acf614..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.asm +++ /dev/null @@ -1,193 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct4x4_16_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY ; name this block of code -;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct4x4_16_add_neon| PROC - - ; The 2D transform is done with two passes which are actually pretty - ; similar. We first transform the rows. This is done by transposing - ; the inputs, doing an SIMD column transform (the columns are the - ; transposed rows) and then transpose the results (so that it goes back - ; in normal/row positions). Then, we transform the columns by doing - ; another SIMD column transform. - ; So, two passes of a transpose followed by a column transform. - - ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! - - ; generate scalar constants - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x 187e - mov r12, #0x1800 - add r12, #0x7e - - ; transpose the input data - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - - ; generate constant vectors - vdup.16 d20, r0 ; replicate cospi_8_64 - vdup.16 d21, r3 ; replicate cospi_16_64 - - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - vdup.16 d22, r12 ; replicate cospi_24_64 - - ; do the transform on transposed rows - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - vswp d18, d19 - - ; transpose the results - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - ; do the transform on columns - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - - ; The results are in two registers, one of them being swapped. This will - ; be taken care of by loading the 'dest' value in a swapped fashion and - ; also storing them in the same swapped fashion. - ; temp_out[0, 1] = d16, d17 = q8 - ; temp_out[2, 3] = d19, d18 = q9 swapped - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) - vrshr.s16 q8, q8, #4 - vrshr.s16 q9, q9, #4 - - vld1.32 {d26[0]}, [r1], r2 - vld1.32 {d26[1]}, [r1], r2 - vld1.32 {d27[1]}, [r1], r2 - vld1.32 {d27[0]}, [r1] ; no post-increment - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d26 - vaddw.u8 q9, q9, d27 - - ; clip_pixel - vqmovun.s16 d26, q8 - vqmovun.s16 d27, q9 - - ; do the stores in reverse order with negative post-increment, by changing - ; the sign of the stride - rsb r2, r2, #0 - vst1.32 {d27[0]}, [r1], r2 - vst1.32 {d27[1]}, [r1], r2 - vst1.32 {d26[1]}, [r1], r2 - vst1.32 {d26[0]}, [r1] ; no post-increment - bx lr - ENDP ; |aom_idct4x4_16_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm new file mode 100644 index 000000000..127acf614 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm @@ -0,0 +1,193 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_idct4x4_16_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + AREA Block, CODE, READONLY ; name this block of code +;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct4x4_16_add_neon| PROC + + ; The 2D transform is done with two passes which are actually pretty + ; similar. We first transform the rows. This is done by transposing + ; the inputs, doing an SIMD column transform (the columns are the + ; transposed rows) and then transpose the results (so that it goes back + ; in normal/row positions). Then, we transform the columns by doing + ; another SIMD column transform. + ; So, two passes of a transpose followed by a column transform. + + ; load the inputs into q8-q9, d16-d19 + vld1.s16 {q8,q9}, [r0]! + + ; generate scalar constants + ; cospi_8_64 = 15137 = 0x3b21 + mov r0, #0x3b00 + add r0, #0x21 + ; cospi_16_64 = 11585 = 0x2d41 + mov r3, #0x2d00 + add r3, #0x41 + ; cospi_24_64 = 6270 = 0x 187e + mov r12, #0x1800 + add r12, #0x7e + + ; transpose the input data + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + + ; generate constant vectors + vdup.16 d20, r0 ; replicate cospi_8_64 + vdup.16 d21, r3 ; replicate cospi_16_64 + + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + vdup.16 d22, r12 ; replicate cospi_24_64 + + ; do the transform on transposed rows + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + vswp d18, d19 + + ; transpose the results + ; 00 01 02 03 d16 + ; 10 11 12 13 d17 + ; 20 21 22 23 d18 + ; 30 31 32 33 d19 + vtrn.16 d16, d17 + vtrn.16 d18, d19 + ; 00 10 02 12 d16 + ; 01 11 03 13 d17 + ; 20 30 22 32 d18 + ; 21 31 23 33 d19 + vtrn.32 q8, q9 + ; 00 10 20 30 d16 + ; 01 11 21 31 d17 + ; 02 12 22 32 d18 + ; 03 13 23 33 d19 + + ; do the transform on columns + + ; stage 1 + vadd.s16 d23, d16, d18 ; (input[0] + input[2]) + vsub.s16 d24, d16, d18 ; (input[0] - input[2]) + + vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 + vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 + + ; (input[0] + input[2]) * cospi_16_64; + ; (input[0] - input[2]) * cospi_16_64; + vmull.s16 q13, d23, d21 + vmull.s16 q14, d24, d21 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64; + ; input[1] * cospi_8_64 + input[3] * cospi_24_64; + vmlsl.s16 q15, d19, d20 + vmlal.s16 q1, d19, d22 + + ; dct_const_round_shift + vqrshrn.s32 d26, q13, #14 + vqrshrn.s32 d27, q14, #14 + vqrshrn.s32 d29, q15, #14 + vqrshrn.s32 d28, q1, #14 + + ; stage 2 + ; output[0] = step[0] + step[3]; + ; output[1] = step[1] + step[2]; + ; output[3] = step[0] - step[3]; + ; output[2] = step[1] - step[2]; + vadd.s16 q8, q13, q14 + vsub.s16 q9, q13, q14 + + ; The results are in two registers, one of them being swapped. This will + ; be taken care of by loading the 'dest' value in a swapped fashion and + ; also storing them in the same swapped fashion. + ; temp_out[0, 1] = d16, d17 = q8 + ; temp_out[2, 3] = d19, d18 = q9 swapped + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + + vld1.32 {d26[0]}, [r1], r2 + vld1.32 {d26[1]}, [r1], r2 + vld1.32 {d27[1]}, [r1], r2 + vld1.32 {d27[0]}, [r1] ; no post-increment + + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d26 + vaddw.u8 q9, q9, d27 + + ; clip_pixel + vqmovun.s16 d26, q8 + vqmovun.s16 d27, q9 + + ; do the stores in reverse order with negative post-increment, by changing + ; the sign of the stride + rsb r2, r2, #0 + vst1.32 {d27[0]}, [r1], r2 + vst1.32 {d27[1]}, [r1], r2 + vst1.32 {d26[1]}, [r1], r2 + vst1.32 {d26[0]}, [r1] ; no post-increment + bx lr + ENDP ; |aom_idct4x4_16_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm deleted file mode 100644 index ec07e2053..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.asm +++ /dev/null @@ -1,91 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |aom_idct8x8_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm new file mode 100644 index 000000000..ec07e2053 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm @@ -0,0 +1,91 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + + + + EXPORT |aom_idct8x8_1_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, +; int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct8x8_1_add_neon| PROC + ldrsh r0, [r0] + + ; generate cospi_16_64 = 11585 + mov r12, #0x2d00 + add r12, #0x41 + + ; out = dct_const_round_shift(input[0] * cospi_16_64) + mul r0, r0, r12 ; input[0] * cospi_16_64 + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; out = dct_const_round_shift(out * cospi_16_64) + mul r0, r0, r12 ; out * cospi_16_64 + mov r12, r1 ; save dest + add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) + asr r0, r0, #14 ; >> DCT_CONST_BITS + + ; a1 = ROUND_POWER_OF_TWO(out, 5) + add r0, r0, #16 ; + (1 <<((5) - 1)) + asr r0, r0, #5 ; >> 5 + + vdup.s16 q0, r0 ; duplicate a1 + + ; load destination data + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1], r2 + vld1.64 {d16}, [r1], r2 + vld1.64 {d17}, [r1] + + vaddw.u8 q9, q0, d2 ; dest[x] + a1 + vaddw.u8 q10, q0, d3 ; dest[x] + a1 + vaddw.u8 q11, q0, d4 ; dest[x] + a1 + vaddw.u8 q12, q0, d5 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + vaddw.u8 q9, q0, d6 ; dest[x] + a1 + vaddw.u8 q10, q0, d7 ; dest[x] + a1 + vaddw.u8 q11, q0, d16 ; dest[x] + a1 + vaddw.u8 q12, q0, d17 ; dest[x] + a1 + vqmovun.s16 d2, q9 ; clip_pixel + vqmovun.s16 d3, q10 ; clip_pixel + vqmovun.s16 d30, q11 ; clip_pixel + vqmovun.s16 d31, q12 ; clip_pixel + vst1.64 {d2}, [r12], r2 + vst1.64 {d3}, [r12], r2 + vst1.64 {d30}, [r12], r2 + vst1.64 {d31}, [r12], r2 + + bx lr + ENDP ; |aom_idct8x8_1_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm deleted file mode 100644 index f3d5f246d..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.asm +++ /dev/null @@ -1,522 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct8x8_64_add_neon| - EXPORT |aom_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. - MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |aom_idct8x8_64_add_neon| - -;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. - mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |aom_idct8x8_12_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm new file mode 100644 index 000000000..f3d5f246d --- /dev/null +++ b/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm @@ -0,0 +1,522 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_idct8x8_64_add_neon| + EXPORT |aom_idct8x8_12_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are + ; loaded in q8-q15. The output will be stored back into q8-q15 registers. + ; This macro will touch q0-q7 registers and use them as buffer during + ; calculation. + MACRO + IDCT8x8_1D + ; stage 1 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r4 ; duplicate cospi_4_64 + vdup.16 d2, r5 ; duplicate cospi_12_64 + vdup.16 d3, r6 ; duplicate cospi_20_64 + + ; input[1] * cospi_28_64 + vmull.s16 q2, d18, d0 + vmull.s16 q3, d19, d0 + + ; input[5] * cospi_12_64 + vmull.s16 q5, d26, d2 + vmull.s16 q6, d27, d2 + + ; input[1]*cospi_28_64-input[7]*cospi_4_64 + vmlsl.s16 q2, d30, d1 + vmlsl.s16 q3, d31, d1 + + ; input[5] * cospi_12_64 - input[3] * cospi_20_64 + vmlsl.s16 q5, d22, d3 + vmlsl.s16 q6, d23, d3 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d8, q2, #14 ; >> 14 + vqrshrn.s32 d9, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q5, #14 ; >> 14 + vqrshrn.s32 d11, q6, #14 ; >> 14 + + ; input[1] * cospi_4_64 + vmull.s16 q2, d18, d1 + vmull.s16 q3, d19, d1 + + ; input[5] * cospi_20_64 + vmull.s16 q9, d26, d3 + vmull.s16 q13, d27, d3 + + ; input[1]*cospi_4_64+input[7]*cospi_28_64 + vmlal.s16 q2, d30, d0 + vmlal.s16 q3, d31, d0 + + ; input[5] * cospi_20_64 + input[3] * cospi_12_64 + vmlal.s16 q9, d22, d2 + vmlal.s16 q13, d23, d2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d14, q2, #14 ; >> 14 + vqrshrn.s32 d15, q3, #14 ; >> 14 + + ; stage 2 & stage 3 - even half + vdup.16 d0, r7 ; duplicate cospi_16_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q9, #14 ; >> 14 + vqrshrn.s32 d13, q13, #14 ; >> 14 + + ; input[0] * cospi_16_64 + vmull.s16 q2, d16, d0 + vmull.s16 q3, d17, d0 + + ; input[0] * cospi_16_64 + vmull.s16 q13, d16, d0 + vmull.s16 q15, d17, d0 + + ; (input[0] + input[2]) * cospi_16_64 + vmlal.s16 q2, d24, d0 + vmlal.s16 q3, d25, d0 + + ; (input[0] - input[2]) * cospi_16_64 + vmlsl.s16 q13, d24, d0 + vmlsl.s16 q15, d25, d0 + + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d18, q2, #14 ; >> 14 + vqrshrn.s32 d19, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d22, q13, #14 ; >> 14 + vqrshrn.s32 d23, q15, #14 ; >> 14 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + ; input[1] * cospi_24_64 + vmull.s16 q2, d20, d0 + vmull.s16 q3, d21, d0 + + ; input[1] * cospi_8_64 + vmull.s16 q8, d20, d1 + vmull.s16 q12, d21, d1 + + ; input[1] * cospi_24_64 - input[3] * cospi_8_64 + vmlsl.s16 q2, d28, d1 + vmlsl.s16 q3, d29, d1 + + ; input[1] * cospi_8_64 + input[3] * cospi_24_64 + vmlal.s16 q8, d28, d0 + vmlal.s16 q12, d29, d0 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d26, q2, #14 ; >> 14 + vqrshrn.s32 d27, q3, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d30, q8, #14 ; >> 14 + vqrshrn.s32 d31, q12, #14 ; >> 14 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + MEND + + ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. + MACRO + TRANSPOSE8X8 + vswp d17, d24 + vswp d23, d30 + vswp d21, d28 + vswp d19, d26 + vtrn.32 q8, q10 + vtrn.32 q9, q11 + vtrn.32 q12, q14 + vtrn.32 q13, q15 + vtrn.16 q8, q9 + vtrn.16 q10, q11 + vtrn.16 q12, q13 + vtrn.16 q14, q15 + MEND + + AREA Block, CODE, READONLY ; name this block of code +;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct8x8_64_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + IDCT8x8_1D + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + vpop {d8-d15} + pop {r4-r9} + bx lr + ENDP ; |aom_idct8x8_64_add_neon| + +;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +; +; r0 int16_t input +; r1 uint8_t *dest +; r2 int dest_stride) + +|aom_idct8x8_12_add_neon| PROC + push {r4-r9} + vpush {d8-d15} + vld1.s16 {q8,q9}, [r0]! + vld1.s16 {q10,q11}, [r0]! + vld1.s16 {q12,q13}, [r0]! + vld1.s16 {q14,q15}, [r0]! + + ; transpose the input data + TRANSPOSE8X8 + + ; generate cospi_28_64 = 3196 + mov r3, #0x0c00 + add r3, #0x7c + + ; generate cospi_4_64 = 16069 + mov r4, #0x3e00 + add r4, #0xc5 + + ; generate cospi_12_64 = 13623 + mov r5, #0x3500 + add r5, #0x37 + + ; generate cospi_20_64 = 9102 + mov r6, #0x2300 + add r6, #0x8e + + ; generate cospi_16_64 = 11585 + mov r7, #0x2d00 + add r7, #0x41 + + ; generate cospi_24_64 = 6270 + mov r8, #0x1800 + add r8, #0x7e + + ; generate cospi_8_64 = 15137 + mov r9, #0x3b00 + add r9, #0x21 + + ; First transform rows + ; stage 1 + ; The following instructions use vqrdmulh to do the + ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling + ; multiply and shift the result by 16 bits instead of 14 bits. So we need + ; to double the constants before multiplying to compensate this. + mov r12, r3, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_28_64*2 + mov r12, r4, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_4_64*2 + + ; dct_const_round_shift(input[1] * cospi_28_64) + vqrdmulh.s16 q4, q9, q0 + + mov r12, r6, lsl #1 + rsb r12, #0 + vdup.16 q0, r12 ; duplicate -cospi_20_64*2 + + ; dct_const_round_shift(input[1] * cospi_4_64) + vqrdmulh.s16 q7, q9, q1 + + mov r12, r5, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_12_64*2 + + ; dct_const_round_shift(- input[3] * cospi_20_64) + vqrdmulh.s16 q5, q11, q0 + + mov r12, r7, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_16_64*2 + + ; dct_const_round_shift(input[3] * cospi_12_64) + vqrdmulh.s16 q6, q11, q1 + + ; stage 2 & stage 3 - even half + mov r12, r8, lsl #1 + vdup.16 q1, r12 ; duplicate cospi_24_64*2 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrdmulh.s16 q9, q8, q0 + + mov r12, r9, lsl #1 + vdup.16 q0, r12 ; duplicate cospi_8_64*2 + + ; dct_const_round_shift(input[1] * cospi_24_64) + vqrdmulh.s16 q13, q10, q1 + + ; dct_const_round_shift(input[1] * cospi_8_64) + vqrdmulh.s16 q15, q10, q0 + + ; stage 3 -odd half + vdup.16 d16, r7 ; duplicate cospi_16_64 + + vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] + vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] + vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] + vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] + + ; stage 2 - odd half + vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] + vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] + vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] + vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] + + ; step2[6] * cospi_16_64 + vmull.s16 q9, d28, d16 + vmull.s16 q10, d29, d16 + + ; step2[6] * cospi_16_64 + vmull.s16 q11, d28, d16 + vmull.s16 q12, d29, d16 + + ; (step2[6] - step2[5]) * cospi_16_64 + vmlsl.s16 q9, d26, d16 + vmlsl.s16 q10, d27, d16 + + ; (step2[5] + step2[6]) * cospi_16_64 + vmlal.s16 q11, d26, d16 + vmlal.s16 q12, d27, d16 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d10, q9, #14 ; >> 14 + vqrshrn.s32 d11, q10, #14 ; >> 14 + + ; dct_const_round_shift(input_dc * cospi_16_64) + vqrshrn.s32 d12, q11, #14 ; >> 14 + vqrshrn.s32 d13, q12, #14 ; >> 14 + + ; stage 4 + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; + + ; Transpose the matrix + TRANSPOSE8X8 + + ; Then transform columns + IDCT8x8_1D + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + vrshr.s16 q8, q8, #5 + vrshr.s16 q9, q9, #5 + vrshr.s16 q10, q10, #5 + vrshr.s16 q11, q11, #5 + vrshr.s16 q12, q12, #5 + vrshr.s16 q13, q13, #5 + vrshr.s16 q14, q14, #5 + vrshr.s16 q15, q15, #5 + + ; save dest pointer + mov r0, r1 + + ; load destination data + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] + + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d0 + vaddw.u8 q9, q9, d1 + vaddw.u8 q10, q10, d2 + vaddw.u8 q11, q11, d3 + vaddw.u8 q12, q12, d4 + vaddw.u8 q13, q13, d5 + vaddw.u8 q14, q14, d6 + vaddw.u8 q15, q15, d7 + + ; clip_pixel + vqmovun.s16 d0, q8 + vqmovun.s16 d1, q9 + vqmovun.s16 d2, q10 + vqmovun.s16 d3, q11 + vqmovun.s16 d4, q12 + vqmovun.s16 d5, q13 + vqmovun.s16 d6, q14 + vqmovun.s16 d7, q15 + + ; store the data + vst1.64 {d0}, [r0], r2 + vst1.64 {d1}, [r0], r2 + vst1.64 {d2}, [r0], r2 + vst1.64 {d3}, [r0], r2 + vst1.64 {d4}, [r0], r2 + vst1.64 {d5}, [r0], r2 + vst1.64 {d6}, [r0], r2 + vst1.64 {d7}, [r0], r2 + + vpop {d8-d15} + pop {r4-r9} + bx lr + ENDP ; |aom_idct8x8_12_add_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm deleted file mode 100644 index b6e2c9edb..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.asm +++ /dev/null @@ -1,202 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_4_dual_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p, -; const uint8_t *blimit0, -; const uint8_t *limit0, -; const uint8_t *thresh0, -; const uint8_t *blimit1, -; const uint8_t *limit1, -; const uint8_t *thresh1) -; r0 uint8_t *s, -; r1 int p, -; r2 const uint8_t *blimit0, -; r3 const uint8_t *limit0, -; sp const uint8_t *thresh0, -; sp+4 const uint8_t *blimit1, -; sp+8 const uint8_t *limit1, -; sp+12 const uint8_t *thresh1, - -|aom_lpf_horizontal_4_dual_neon| PROC - push {lr} - - ldr r12, [sp, #4] ; load thresh0 - vld1.8 {d0}, [r2] ; load blimit0 to first half q - vld1.8 {d2}, [r3] ; load limit0 to first half q - - add r1, r1, r1 ; double pitch - ldr r2, [sp, #8] ; load blimit1 - - vld1.8 {d4}, [r12] ; load thresh0 to first half q - - ldr r3, [sp, #12] ; load limit1 - ldr r12, [sp, #16] ; load thresh1 - vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q - - sub r2, r0, r1, lsl #1 ; s[-4 * p] - - vld1.8 {d3}, [r3] ; load limit1 to 2nd half q - vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q - - vpush {d8-d15} ; save neon registers - - add r3, r2, r1, lsr #1 ; s[-3 * p] - - vld1.u8 {q3}, [r2@64], r1 ; p3 - vld1.u8 {q4}, [r3@64], r1 ; p2 - vld1.u8 {q5}, [r2@64], r1 ; p1 - vld1.u8 {q6}, [r3@64], r1 ; p0 - vld1.u8 {q7}, [r2@64], r1 ; q0 - vld1.u8 {q8}, [r3@64], r1 ; q1 - vld1.u8 {q9}, [r2@64] ; q2 - vld1.u8 {q10}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl aom_loop_filter_neon_16 - - vst1.u8 {q5}, [r2@64], r1 ; store op1 - vst1.u8 {q6}, [r3@64], r1 ; store op0 - vst1.u8 {q7}, [r2@64], r1 ; store oq0 - vst1.u8 {q8}, [r3@64], r1 ; store oq1 - - vpop {d8-d15} ; restore neon registers - - pop {pc} - ENDP ; |aom_lpf_horizontal_4_dual_neon| - -; void aom_loop_filter_neon_16(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. This function uses -; registers d8-d15, so the calling function must save those registers. -; -; r0-r3, r12 PRESERVE -; q0 blimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -; -; Outputs: -; q5 op1 -; q6 op0 -; q7 oq0 -; q8 oq1 -|aom_loop_filter_neon_16| PROC - - ; filter_mask - vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) - vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) - vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) - vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) - vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) - vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) - vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) - - vmov.u8 q10, #0x80 - - vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) - - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - veor q7, q7, q10 ; qs0 - - vcge.u8 q15, q1, q15 ; abs(m11) > limit - - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u16 q4, #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; a > blimit - - vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; hev - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; filter &= hev - vand q15, q15, q9 ; mask - - vmov.u8 q4, #3 - - vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; filter &= mask - - vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) - vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) - vshr.s8 q2, q2, #3 ; filter2 >>= 3 - vshr.s8 q1, q1, #3 ; filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) - vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 - - veor q7, q0, q10 ; *oq0 = u^0x80 - - vbic q1, q1, q14 ; filter &= ~hev - - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) - - veor q6, q11, q10 ; *op0 = u^0x80 - veor q5, q13, q10 ; *op1 = u^0x80 - veor q8, q12, q10 ; *oq1 = u^0x80 - - bx lr - ENDP ; |aom_loop_filter_neon_16| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm new file mode 100644 index 000000000..b6e2c9edb --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm @@ -0,0 +1,202 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_4_dual_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|aom_lpf_horizontal_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + add r1, r1, r1 ; double pitch + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, r1, lsl #1 ; s[-4 * p] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + add r3, r2, r1, lsr #1 ; s[-3 * p] + + vld1.u8 {q3}, [r2@64], r1 ; p3 + vld1.u8 {q4}, [r3@64], r1 ; p2 + vld1.u8 {q5}, [r2@64], r1 ; p1 + vld1.u8 {q6}, [r3@64], r1 ; p0 + vld1.u8 {q7}, [r2@64], r1 ; q0 + vld1.u8 {q8}, [r3@64], r1 ; q1 + vld1.u8 {q9}, [r2@64] ; q2 + vld1.u8 {q10}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl aom_loop_filter_neon_16 + + vst1.u8 {q5}, [r2@64], r1 ; store op1 + vst1.u8 {q6}, [r3@64], r1 ; store op0 + vst1.u8 {q7}, [r2@64], r1 ; store oq0 + vst1.u8 {q8}, [r3@64], r1 ; store oq1 + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |aom_lpf_horizontal_4_dual_neon| + +; void aom_loop_filter_neon_16(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. This function uses +; registers d8-d15, so the calling function must save those registers. +; +; r0-r3, r12 PRESERVE +; q0 blimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +; +; Outputs: +; q5 op1 +; q6 op0 +; q7 oq0 +; q8 oq1 +|aom_loop_filter_neon_16| PROC + + ; filter_mask + vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) + vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) + vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) + vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) + vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) + vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) + vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) + + vmov.u8 q10, #0x80 + + vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) + + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + veor q7, q7, q10 ; qs0 + + vcge.u8 q15, q1, q15 ; abs(m11) > limit + + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u16 q4, #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; a > blimit + + vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; hev + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; filter &= hev + vand q15, q15, q9 ; mask + + vmov.u8 q4, #3 + + vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; filter &= mask + + vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) + vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) + vshr.s8 q2, q2, #3 ; filter2 >>= 3 + vshr.s8 q1, q1, #3 ; filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) + vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 + + veor q7, q0, q10 ; *oq0 = u^0x80 + + vbic q1, q1, q14 ; filter &= ~hev + + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) + + veor q6, q11, q10 ; *op0 = u^0x80 + veor q5, q13, q10 ; *op1 = u^0x80 + veor q8, q12, q10 ; *oq1 = u^0x80 + + bx lr + ENDP ; |aom_loop_filter_neon_16| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm deleted file mode 100644 index 8b54984d5..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.asm +++ /dev/null @@ -1,252 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_4_neon| - EXPORT |aom_lpf_vertical_4_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_horizontal_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_horizontal_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #4] ; load thresh - add r1, r1, r1 ; double pitch - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - - sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r3, r2, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r2@64], r1 ; p3 - vld1.u8 {d4}, [r3@64], r1 ; p2 - vld1.u8 {d5}, [r2@64], r1 ; p1 - vld1.u8 {d6}, [r3@64], r1 ; p0 - vld1.u8 {d7}, [r2@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r2@64] ; q2 - vld1.u8 {d18}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl aom_loop_filter_neon - - vst1.u8 {d4}, [r2@64], r1 ; store op1 - vst1.u8 {d5}, [r3@64], r1 ; store op0 - vst1.u8 {d6}, [r2@64], r1 ; store oq0 - vst1.u8 {d7}, [r3@64], r1 ; store oq1 - - pop {pc} - ENDP ; |aom_lpf_horizontal_4_neon| - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_vertical_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #4] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - bl aom_loop_filter_neon - - sub r0, r0, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 - vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 - vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 - vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 - vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 - vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 - vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 - vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] - - pop {pc} - ENDP ; |aom_lpf_vertical_4_neon| - -; void aom_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d4 op1 -; d5 op0 -; d6 oq0 -; d7 oq1 -|aom_loop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d17, d6, d7 ; abs(p0 - q0) - - vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) - - vmov.u8 d18, #0x80 - - vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) - - ; hevmask - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) - - vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 - - veor d7, d7, d18 ; qs0 - - vcge.u8 d23, d1, d23 ; abs(m1) > limit - - ; filter() function - ; convert to signed - - vshr.u8 d28, d28, #1 ; a = a / 2 - veor d6, d6, d18 ; ps0 - - veor d5, d5, d18 ; ps1 - vqadd.u8 d17, d17, d28 ; a = b + a - - veor d16, d16, d18 ; qs1 - - vmov.u8 d19, #3 - - vsub.s8 d28, d7, d6 ; ( qs0 - ps0) - - vcge.u8 d17, d0, d17 ; a > blimit - - vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) - vorr d22, d21, d22 ; hevmask - - vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) - - vand d27, d27, d22 ; filter &= hev - vand d23, d23, d17 ; filter_mask - - vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d17, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d27, q12 - - vand d27, d27, d23 ; filter &= mask - - vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) - vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) - vshr.s8 d28, d28, #3 ; filter2 >>= 3 - vshr.s8 d27, d27, #3 ; filter1 >>= 3 - - vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) - vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 - - veor d6, d26, d18 ; *oq0 = u^0x80 - - vbic d27, d27, d22 ; filter &= ~hev - - vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) - vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) - - veor d5, d19, d18 ; *op0 = u^0x80 - veor d4, d21, d18 ; *op1 = u^0x80 - veor d7, d20, d18 ; *oq1 = u^0x80 - - bx lr - ENDP ; |aom_loop_filter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm new file mode 100644 index 000000000..8b54984d5 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm @@ -0,0 +1,252 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_4_neon| + EXPORT |aom_lpf_vertical_4_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. +; +; void aom_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_horizontal_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r3, r2, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl aom_loop_filter_neon + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + pop {pc} + ENDP ; |aom_lpf_horizontal_4_neon| + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. +; +; void aom_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl aom_loop_filter_neon + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + pop {pc} + ENDP ; |aom_lpf_vertical_4_neon| + +; void aom_loop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|aom_loop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; a > blimit + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |aom_loop_filter_neon| + + END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm deleted file mode 100644 index 9f3db66ee..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.asm +++ /dev/null @@ -1,428 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_8_neon| - EXPORT |aom_lpf_vertical_8_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_horizontal_8_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_horizontal_8_neon| PROC - push {r4-r5, lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #12] ; load thresh - add r1, r1, r1 ; double pitch - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - - sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r2, r3, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r3@64], r1 ; p3 - vld1.u8 {d4}, [r2@64], r1 ; p2 - vld1.u8 {d5}, [r3@64], r1 ; p1 - vld1.u8 {d6}, [r2@64], r1 ; p0 - vld1.u8 {d7}, [r3@64], r1 ; q0 - vld1.u8 {d16}, [r2@64], r1 ; q1 - vld1.u8 {d17}, [r3@64] ; q2 - vld1.u8 {d18}, [r2@64], r1 ; q3 - - sub r3, r3, r1, lsl #1 - sub r2, r2, r1, lsl #2 - - bl aom_mbloop_filter_neon - - vst1.u8 {d0}, [r2@64], r1 ; store op2 - vst1.u8 {d1}, [r3@64], r1 ; store op1 - vst1.u8 {d2}, [r2@64], r1 ; store op0 - vst1.u8 {d3}, [r3@64], r1 ; store oq0 - vst1.u8 {d4}, [r2@64], r1 ; store oq1 - vst1.u8 {d5}, [r3@64], r1 ; store oq2 - - pop {r4-r5, pc} - - ENDP ; |aom_lpf_horizontal_8_neon| - -; void aom_lpf_vertical_8_neon(uint8_t *s, -; int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_8_neon| PROC - push {r4-r5, lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #12] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - sub r2, r0, #3 - add r3, r0, #1 - - bl aom_mbloop_filter_neon - - ;store op2, op1, op0, oq0 - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 - vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 - vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 - vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 - vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 - vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 - vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 - vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] - - ;store oq1, oq2 - vst2.8 {d4[0], d5[0]}, [r3], r1 - vst2.8 {d4[1], d5[1]}, [r3], r1 - vst2.8 {d4[2], d5[2]}, [r3], r1 - vst2.8 {d4[3], d5[3]}, [r3], r1 - vst2.8 {d4[4], d5[4]}, [r3], r1 - vst2.8 {d4[5], d5[5]}, [r3], r1 - vst2.8 {d4[6], d5[6]}, [r3], r1 - vst2.8 {d4[7], d5[7]}, [r3] - - pop {r4-r5, pc} - ENDP ; |aom_lpf_vertical_8_neon| - -; void aom_mbloop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d0 op2 -; d1 op1 -; d2 op0 -; d3 oq0 -; d4 oq1 -; d5 oq2 -|aom_mbloop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) - - vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) - - vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) - - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) - vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) - vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d1, d19 - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) - vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) - - vshr.u8 d23, d23, #1 ; a = a / 2 - - vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) - - vqadd.u8 d24, d24, d23 ; a = b + a - - vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) - - vmov.u8 d23, #1 - vcge.u8 d24, d0, d24 ; a > blimit - - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - - vcge.u8 d20, d23, d20 ; flat - - vand d19, d19, d24 ; mask - - vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - - vand d20, d20, d19 ; flat & mask - - vmov.u8 d22, #0x80 - - vorr d23, d21, d23 ; hev - - ; This instruction will truncate the "flat & mask" masks down to 4 bits - ; each to fit into one 32 bit arm register. The values are stored in - ; q10.64[0]. - vshrn.u16 d30, q10, #4 - vmov.u32 r4, d30[0] ; flat & mask 4bits - - adds r5, r4, #1 ; Check for all 1's - - ; If mask and flat are 1's for all vectors, then we only need to execute - ; the power branch for all vectors. - beq power_branch_only - - cmp r4, #0 ; Check for 0, set flag for later - - ; mbfilter() function - ; filter() function - ; convert to signed - veor d21, d7, d22 ; qs0 - veor d24, d6, d22 ; ps0 - veor d25, d5, d22 ; ps1 - veor d26, d16, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d21, d24 ; ( qs0 - ps0) - - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - - vand d29, d29, d23 ; filter &= hev - - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 d29, d29, #1 - vbic d29, d29, d23 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - ; If mask and flat are 0's for all vectors, then we only need to execute - ; the filter branch for all vectors. - beq filter_branch_only - - ; If mask and flat are mixed then we must perform both branches and - ; combine the data. - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d21, d21, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - ; At this point we have already executed the filter branch. The filter - ; branch does not set op2 or oq2, so use p2 and q2. Execute the power - ; branch and combine the data. - vmov.u8 d23, #2 - vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 - vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 - vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 - - vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) - - vaddw.u8 q14, d5 ; r_op2 += p1 - - vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) - - vqrshrn.u16 d30, q14, #3 ; r_op2 - - vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 - vsubw.u8 q14, d4 ; r_op1 -= p2 - vaddw.u8 q14, d5 ; r_op1 += p1 - vaddw.u8 q14, d16 ; r_op1 += q1 - - vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) - - vqrshrn.u16 d31, q14, #3 ; r_op1 - - vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 - vsubw.u8 q14, d5 ; r_op0 -= p1 - vaddw.u8 q14, d6 ; r_op0 += p0 - vaddw.u8 q14, d17 ; r_op0 += q2 - - vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) - - vqrshrn.u16 d23, q14, #3 ; r_op0 - - vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 - vsubw.u8 q14, d6 ; r_oq0 -= p0 - vaddw.u8 q14, d7 ; r_oq0 += q0 - - vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) - - vaddw.u8 q14, d18 ; oq0 += q3 - - vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) - - vqrshrn.u16 d22, q14, #3 ; r_oq0 - - vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 - vsubw.u8 q14, d7 ; r_oq1 -= q0 - vaddw.u8 q14, d16 ; r_oq1 += q1 - - vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) - - vaddw.u8 q14, d18 ; r_oq1 += q3 - - vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) - - vqrshrn.u16 d6, q14, #3 ; r_oq1 - - vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 - vsubw.u8 q14, d16 ; r_oq2 -= q1 - vaddw.u8 q14, d17 ; r_oq2 += q2 - vaddw.u8 q14, d18 ; r_oq2 += q3 - - vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) - - vqrshrn.u16 d7, q14, #3 ; r_oq2 - - vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) - vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) - vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) - - bx lr - -power_branch_only - vmov.u8 d27, #3 - vmov.u8 d21, #2 - vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 - vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 - vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 - vaddw.u8 q14, d5 ; op2 += p1 - vqrshrn.u16 d0, q14, #3 ; op2 - - vsubw.u8 q14, d3 ; op1 = op2 - p3 - vsubw.u8 q14, d4 ; op1 -= p2 - vaddw.u8 q14, d5 ; op1 += p1 - vaddw.u8 q14, d16 ; op1 += q1 - vqrshrn.u16 d1, q14, #3 ; op1 - - vsubw.u8 q14, d3 ; op0 = op1 - p3 - vsubw.u8 q14, d5 ; op0 -= p1 - vaddw.u8 q14, d6 ; op0 += p0 - vaddw.u8 q14, d17 ; op0 += q2 - vqrshrn.u16 d2, q14, #3 ; op0 - - vsubw.u8 q14, d3 ; oq0 = op0 - p3 - vsubw.u8 q14, d6 ; oq0 -= p0 - vaddw.u8 q14, d7 ; oq0 += q0 - vaddw.u8 q14, d18 ; oq0 += q3 - vqrshrn.u16 d3, q14, #3 ; oq0 - - vsubw.u8 q14, d4 ; oq1 = oq0 - p2 - vsubw.u8 q14, d7 ; oq1 -= q0 - vaddw.u8 q14, d16 ; oq1 += q1 - vaddw.u8 q14, d18 ; oq1 += q3 - vqrshrn.u16 d4, q14, #3 ; oq1 - - vsubw.u8 q14, d5 ; oq2 = oq1 - p1 - vsubw.u8 q14, d16 ; oq2 -= q1 - vaddw.u8 q14, d17 ; oq2 += q2 - vaddw.u8 q14, d18 ; oq2 += q3 - vqrshrn.u16 d5, q14, #3 ; oq2 - - bx lr - -filter_branch_only - ; TODO(fgalligan): See if we can rearange registers so we do not need to - ; do the 2 vswp. - vswp d0, d4 ; op2 - vswp d5, d17 ; oq2 - veor d2, d24, d22 ; *op0 = u^0x80 - veor d3, d21, d22 ; *oq0 = u^0x80 - veor d1, d25, d22 ; *op1 = u^0x80 - veor d4, d26, d22 ; *oq1 = u^0x80 - - bx lr - - ENDP ; |aom_mbloop_filter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm new file mode 100644 index 000000000..9f3db66ee --- /dev/null +++ b/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm @@ -0,0 +1,428 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + + EXPORT |aom_lpf_horizontal_8_neon| + EXPORT |aom_lpf_vertical_8_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently aom only works on iterations 8 at a time. The aom loop filter +; works on 16 iterations at a time. +; +; void aom_lpf_horizontal_8_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_horizontal_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r2, [sp, #12] ; load thresh + add r1, r1, r1 ; double pitch + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + + sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r2, r3, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r3@64], r1 ; p3 + vld1.u8 {d4}, [r2@64], r1 ; p2 + vld1.u8 {d5}, [r3@64], r1 ; p1 + vld1.u8 {d6}, [r2@64], r1 ; p0 + vld1.u8 {d7}, [r3@64], r1 ; q0 + vld1.u8 {d16}, [r2@64], r1 ; q1 + vld1.u8 {d17}, [r3@64] ; q2 + vld1.u8 {d18}, [r2@64], r1 ; q3 + + sub r3, r3, r1, lsl #1 + sub r2, r2, r1, lsl #2 + + bl aom_mbloop_filter_neon + + vst1.u8 {d0}, [r2@64], r1 ; store op2 + vst1.u8 {d1}, [r3@64], r1 ; store op1 + vst1.u8 {d2}, [r2@64], r1 ; store op0 + vst1.u8 {d3}, [r3@64], r1 ; store oq0 + vst1.u8 {d4}, [r2@64], r1 ; store oq1 + vst1.u8 {d5}, [r3@64], r1 ; store oq2 + + pop {r4-r5, pc} + + ENDP ; |aom_lpf_horizontal_8_neon| + +; void aom_lpf_vertical_8_neon(uint8_t *s, +; int pitch, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; +; r0 uint8_t *s, +; r1 int pitch, +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|aom_lpf_vertical_8_neon| PROC + push {r4-r5, lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #12] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + sub r2, r0, #3 + add r3, r0, #1 + + bl aom_mbloop_filter_neon + + ;store op2, op1, op0, oq0 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] + + ;store oq1, oq2 + vst2.8 {d4[0], d5[0]}, [r3], r1 + vst2.8 {d4[1], d5[1]}, [r3], r1 + vst2.8 {d4[2], d5[2]}, [r3], r1 + vst2.8 {d4[3], d5[3]}, [r3], r1 + vst2.8 {d4[4], d5[4]}, [r3], r1 + vst2.8 {d4[5], d5[5]}, [r3], r1 + vst2.8 {d4[6], d5[6]}, [r3], r1 + vst2.8 {d4[7], d5[7]}, [r3] + + pop {r4-r5, pc} + ENDP ; |aom_lpf_vertical_8_neon| + +; void aom_mbloop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d0 op2 +; d1 op1 +; d2 op0 +; d3 oq0 +; d4 oq1 +; d5 oq2 +|aom_mbloop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) + + vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) + + vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) + + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) + vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) + vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d1, d19 + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) + vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) + + vshr.u8 d23, d23, #1 ; a = a / 2 + + vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) + + vqadd.u8 d24, d24, d23 ; a = b + a + + vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) + + vmov.u8 d23, #1 + vcge.u8 d24, d0, d24 ; a > blimit + + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + + vcge.u8 d20, d23, d20 ; flat + + vand d19, d19, d24 ; mask + + vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + + vand d20, d20, d19 ; flat & mask + + vmov.u8 d22, #0x80 + + vorr d23, d21, d23 ; hev + + ; This instruction will truncate the "flat & mask" masks down to 4 bits + ; each to fit into one 32 bit arm register. The values are stored in + ; q10.64[0]. + vshrn.u16 d30, q10, #4 + vmov.u32 r4, d30[0] ; flat & mask 4bits + + adds r5, r4, #1 ; Check for all 1's + + ; If mask and flat are 1's for all vectors, then we only need to execute + ; the power branch for all vectors. + beq power_branch_only + + cmp r4, #0 ; Check for 0, set flag for later + + ; mbfilter() function + ; filter() function + ; convert to signed + veor d21, d7, d22 ; qs0 + veor d24, d6, d22 ; ps0 + veor d25, d5, d22 ; ps1 + veor d26, d16, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d21, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d23 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d23 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + ; If mask and flat are 0's for all vectors, then we only need to execute + ; the filter branch for all vectors. + beq filter_branch_only + + ; If mask and flat are mixed then we must perform both branches and + ; combine the data. + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d21, d21, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + ; At this point we have already executed the filter branch. The filter + ; branch does not set op2 or oq2, so use p2 and q2. Execute the power + ; branch and combine the data. + vmov.u8 d23, #2 + vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 + vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 + + vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) + + vaddw.u8 q14, d5 ; r_op2 += p1 + + vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + + vqrshrn.u16 d30, q14, #3 ; r_op2 + + vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 + vsubw.u8 q14, d4 ; r_op1 -= p2 + vaddw.u8 q14, d5 ; r_op1 += p1 + vaddw.u8 q14, d16 ; r_op1 += q1 + + vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + + vqrshrn.u16 d31, q14, #3 ; r_op1 + + vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 + vsubw.u8 q14, d5 ; r_op0 -= p1 + vaddw.u8 q14, d6 ; r_op0 += p0 + vaddw.u8 q14, d17 ; r_op0 += q2 + + vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) + + vqrshrn.u16 d23, q14, #3 ; r_op0 + + vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 + vsubw.u8 q14, d6 ; r_oq0 -= p0 + vaddw.u8 q14, d7 ; r_oq0 += q0 + + vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) + + vaddw.u8 q14, d18 ; oq0 += q3 + + vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) + + vqrshrn.u16 d22, q14, #3 ; r_oq0 + + vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 + vsubw.u8 q14, d7 ; r_oq1 -= q0 + vaddw.u8 q14, d16 ; r_oq1 += q1 + + vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) + + vaddw.u8 q14, d18 ; r_oq1 += q3 + + vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) + + vqrshrn.u16 d6, q14, #3 ; r_oq1 + + vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 + vsubw.u8 q14, d16 ; r_oq2 -= q1 + vaddw.u8 q14, d17 ; r_oq2 += q2 + vaddw.u8 q14, d18 ; r_oq2 += q3 + + vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) + + vqrshrn.u16 d7, q14, #3 ; r_oq2 + + vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) + vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) + vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) + + bx lr + +power_branch_only + vmov.u8 d27, #3 + vmov.u8 d21, #2 + vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 + vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 + vaddw.u8 q14, d5 ; op2 += p1 + vqrshrn.u16 d0, q14, #3 ; op2 + + vsubw.u8 q14, d3 ; op1 = op2 - p3 + vsubw.u8 q14, d4 ; op1 -= p2 + vaddw.u8 q14, d5 ; op1 += p1 + vaddw.u8 q14, d16 ; op1 += q1 + vqrshrn.u16 d1, q14, #3 ; op1 + + vsubw.u8 q14, d3 ; op0 = op1 - p3 + vsubw.u8 q14, d5 ; op0 -= p1 + vaddw.u8 q14, d6 ; op0 += p0 + vaddw.u8 q14, d17 ; op0 += q2 + vqrshrn.u16 d2, q14, #3 ; op0 + + vsubw.u8 q14, d3 ; oq0 = op0 - p3 + vsubw.u8 q14, d6 ; oq0 -= p0 + vaddw.u8 q14, d7 ; oq0 += q0 + vaddw.u8 q14, d18 ; oq0 += q3 + vqrshrn.u16 d3, q14, #3 ; oq0 + + vsubw.u8 q14, d4 ; oq1 = oq0 - p2 + vsubw.u8 q14, d7 ; oq1 -= q0 + vaddw.u8 q14, d16 ; oq1 += q1 + vaddw.u8 q14, d18 ; oq1 += q3 + vqrshrn.u16 d4, q14, #3 ; oq1 + + vsubw.u8 q14, d5 ; oq2 = oq1 - p1 + vsubw.u8 q14, d16 ; oq2 -= q1 + vaddw.u8 q14, d17 ; oq2 += q2 + vaddw.u8 q14, d18 ; oq2 += q3 + vqrshrn.u16 d5, q14, #3 ; oq2 + + bx lr + +filter_branch_only + ; TODO(fgalligan): See if we can rearange registers so we do not need to + ; do the 2 vswp. + vswp d0, d4 ; op2 + vswp d5, d17 ; oq2 + veor d2, d24, d22 ; *op0 = u^0x80 + veor d3, d21, d22 ; *oq0 = u^0x80 + veor d1, d25, d22 ; *op1 = u^0x80 + veor d4, d26, d22 ; *oq1 = u^0x80 + + bx lr + + ENDP ; |aom_mbloop_filter_neon| + + END diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm deleted file mode 100644 index 91b3d126c..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.asm +++ /dev/null @@ -1,259 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 4 dd 16 -pw_32: times 4 dd 32 - -SECTION .text -INIT_XMM sse2 -cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - movq m2, [leftq] - paddw m0, m2 - pshuflw m1, m0, 0xe - paddw m0, m1 - pshuflw m1, m0, 0x1 - paddw m0, m1 - paddw m0, [GLOBAL(pw_4)] - psraw m0, 3 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, one - mov oned, 0x00010001 - lea stride3q, [strideq*3] - movd m3, oned - pshufd m3, m3, 0x0 - paddw m0, m2 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 - paddw m0, [GLOBAL(pw_8)] - psrlw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - lea dstq, [dstq+strideq*8] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m3, [aboveq+16] - mova m2, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - punpcklwd m0, m1 - movhlps m2, m0 - paddd m0, m2 - punpckldq m0, m1 - movhlps m2, m0 - paddd m0, m2 - paddd m0, [GLOBAL(pw_16)] - psrad m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m0 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2+16], m0 - lea dstq, [dstq+strideq*8] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [aboveq+32] - mova m4, [aboveq+48] - paddw m0, m2 - paddw m3, m4 - mova m2, [leftq] - mova m4, [leftq+16] - mova m5, [leftq+32] - mova m6, [leftq+48] - paddw m2, m4 - paddw m5, m6 - paddw m0, m3 - paddw m2, m5 - pxor m1, m1 - paddw m0, m2 - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - movhlps m2, m0 - paddw m0, m2 - punpcklwd m0, m1 - movhlps m2, m0 - paddd m0, m2 - punpckldq m0, m1 - movhlps m2, m0 - paddd m0, m2 - paddd m0, [GLOBAL(pw_32)] - psrad m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16 ], m0 - mova [dstq +32 ], m0 - mova [dstq +48 ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16 ], m0 - mova [dstq+strideq*2+32 ], m0 - mova [dstq+strideq*2+48 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4+16 ], m0 - mova [dstq+strideq*4+32 ], m0 - mova [dstq+strideq*4+48 ], m0 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2 +16], m0 - mova [dstq+stride3q*2 +32], m0 - mova [dstq+stride3q*2 +48], m0 - lea dstq, [dstq+strideq*8] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - lea dstq, [dstq+strideq*8] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m1 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m1 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2+16], m1 - lea dstq, [dstq+strideq*8] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - mova m2, [aboveq+32] - mova m3, [aboveq+48] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq +32], m2 - mova [dstq +48], m3 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m1 - mova [dstq+strideq*2 +32], m2 - mova [dstq+strideq*2 +48], m3 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m1 - mova [dstq+strideq*4 +32], m2 - mova [dstq+strideq*4 +48], m3 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2 +16], m1 - mova [dstq+stride3q*2 +32], m2 - mova [dstq+stride3q*2 +48], m3 - lea dstq, [dstq+strideq*8] - dec nlines4d - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm new file mode 100644 index 000000000..91b3d126c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm @@ -0,0 +1,259 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_XMM sse2 +cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + paddw m0, m2 + pshuflw m1, m0, 0xe + paddw m0, m1 + pshuflw m1, m0, 0x1 + paddw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset + GET_GOT goffsetq + + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + paddw m0, m2 + paddw m3, m4 + mova m2, [leftq] + mova m4, [leftq+16] + mova m5, [leftq+32] + mova m6, [leftq+48] + paddw m2, m4 + paddw m5, m6 + paddw m0, m3 + paddw m2, m5 + pxor m1, m1 + paddw m0, m2 + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2.asm deleted file mode 100644 index 9aece27be..000000000 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.asm +++ /dev/null @@ -1,625 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pb_1: times 16 db 1 -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -dc_128: times 16 db 128 -pw2_4: times 8 dw 2 -pw2_8: times 8 dw 4 -pw2_16: times 8 dw 8 -pw2_32: times 8 dw 16 - -SECTION .text - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM sse2 -cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - movd m2, [leftq] - movd m0, [aboveq] - pxor m1, m1 - punpckldq m0, m2 - psadbw m0, m1 - paddw m0, [GLOBAL(pw_4)] - psraw m0, 3 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [leftq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [aboveq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshuflw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - movq m2, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - paddw m0, [GLOBAL(pw_8)] - psraw m0, 4 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset - movifnidn leftq, leftmp - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - punpcklbw m0, m0 - pshuflw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movd m0, [GLOBAL(dc_128)] - movd [dstq ], m0 - movd [dstq+strideq ], m0 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq m0, [GLOBAL(dc_128)] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_16)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - - -INIT_XMM sse2 -cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - - -INIT_XMM sse2 -cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - psadbw m3, m1 - psadbw m4, m1 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_32)] - psraw m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - mova m2, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above - movd m0, [aboveq] - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m1 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left - movifnidn leftq, leftmp - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 - pshufd m1, m0, 0x1 - movd [dstq ], m0 - movd [dstq+strideq], m1 - pshufd m2, m0, 0x2 - lea dstq, [dstq+strideq*2] - pshufd m3, m0, 0x3 - movd [dstq ], m2 - movd [dstq+strideq], m3 - RET - -INIT_XMM sse2 -cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -2 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] - movq m0, [leftq ] - punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 -.loop: - pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 - pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 - movq [dstq ], m1 - movq [dstq+strideq], m2 - pshuflw m1, m0, 0xaa - pshuflw m2, m0, 0xff - movq [dstq+strideq*2], m1 - movq [dstq+stride3q ], m2 - pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 - inc lineq - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -4 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+strideq ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2], m1 - mova [dstq+stride3q ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left - movifnidn leftq, leftmp - mov lineq, -8 - DEFINE_ARGS dst, stride, line, left, stride3 - lea stride3q, [strideq*3] -.loop: - movd m0, [leftq] - punpcklbw m0, m0 - punpcklbw m0, m0 ; l1 to l4 each repeated 4 times - pshufd m1, m0, 0x0 ; l1 repeated 16 times - pshufd m2, m0, 0x55 ; l2 repeated 16 times - mova [dstq ], m1 - mova [dstq+16 ], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16 ], m2 - pshufd m1, m0, 0xaa - pshufd m2, m0, 0xff - mova [dstq+strideq*2 ], m1 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m2 - inc lineq - lea leftq, [leftq+4 ] - lea dstq, [dstq+strideq*4] - jnz .loop - REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm new file mode 100644 index 000000000..9aece27be --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm @@ -0,0 +1,625 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pb_1: times 16 db 1 +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 8 dw 16 +pw_32: times 8 dw 32 +dc_128: times 16 db 128 +pw2_4: times 8 dw 2 +pw2_8: times 8 dw 4 +pw2_16: times 8 dw 8 +pw2_32: times 8 dw 16 + +SECTION .text + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM sse2 +cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + movd m2, [leftq] + movd m0, [aboveq] + pxor m1, m1 + punpckldq m0, m2 + psadbw m0, m1 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [leftq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movd m0, [aboveq] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_4)] + psraw m0, 2 + pshuflw m0, m0, 0x0 + packuswb m0, m0 + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + paddw m0, [GLOBAL(pw_8)] + psraw m0, 4 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset + movifnidn leftq, leftmp + GET_GOT goffsetq + + pxor m1, m1 + movq m0, [leftq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + psadbw m0, m1 + paddw m0, [GLOBAL(pw2_8)] + psraw m0, 3 + punpcklbw m0, m0 + pshuflw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movd m0, [GLOBAL(dc_128)] + movd [dstq ], m0 + movd [dstq+strideq ], m0 + movd [dstq+strideq*2], m0 + movd [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq m0, [GLOBAL(dc_128)] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_16)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + + +INIT_XMM sse2 +cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + psadbw m0, m1 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_16)] + psraw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + + +INIT_XMM sse2 +cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + psadbw m3, m1 + psadbw m4, m1 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw_32)] + psraw m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [leftq] + mova m2, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + psadbw m0, m1 + psadbw m2, m1 + paddw m0, m2 + movhlps m2, m0 + paddw m0, m2 + paddw m0, [GLOBAL(pw2_32)] + psraw m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + packuswb m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +INIT_XMM sse2 +cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset + GET_GOT goffsetq + + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + mova m0, [GLOBAL(dc_128)] +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m0 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m0 + lea dstq, [dstq+strideq*4] + dec lines4d + jnz .loop + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above + movd m0, [aboveq] + movd [dstq ], m0 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + movd [dstq ], m0 + movd [dstq+strideq], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq ], m0 + movq [dstq+strideq*2], m0 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM sse2 +cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq+strideq ], m0 + mova [dstq+strideq*2], m0 + mova [dstq+stride3q ], m0 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq ], m0 + mova [dstq+strideq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q +16], m1 + lea dstq, [dstq+strideq*4] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left + movifnidn leftq, leftmp + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 + pshufd m1, m0, 0x1 + movd [dstq ], m0 + movd [dstq+strideq], m1 + pshufd m2, m0, 0x2 + lea dstq, [dstq+strideq*2] + pshufd m3, m0, 0x3 + movd [dstq ], m2 + movd [dstq+strideq], m3 + RET + +INIT_XMM sse2 +cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -2 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] + movq m0, [leftq ] + punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8 +.loop: + pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1 + pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2 + movq [dstq ], m1 + movq [dstq+strideq], m2 + pshuflw m1, m0, 0xaa + pshuflw m2, m0, 0xff + movq [dstq+strideq*2], m1 + movq [dstq+stride3q ], m2 + pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8 + inc lineq + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -4 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+strideq ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2], m1 + mova [dstq+stride3q ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left + movifnidn leftq, leftmp + mov lineq, -8 + DEFINE_ARGS dst, stride, line, left, stride3 + lea stride3q, [strideq*3] +.loop: + movd m0, [leftq] + punpcklbw m0, m0 + punpcklbw m0, m0 ; l1 to l4 each repeated 4 times + pshufd m1, m0, 0x0 ; l1 repeated 16 times + pshufd m2, m0, 0x55 ; l2 repeated 16 times + mova [dstq ], m1 + mova [dstq+16 ], m1 + mova [dstq+strideq ], m2 + mova [dstq+strideq+16 ], m2 + pshufd m1, m0, 0xaa + pshufd m2, m0, 0xff + mova [dstq+strideq*2 ], m1 + mova [dstq+strideq*2+16], m1 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m2 + inc lineq + lea leftq, [leftq+4 ] + lea dstq, [dstq+strideq*4] + jnz .loop + REP_RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm deleted file mode 100644 index bc1bb2ff3..000000000 --- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.asm +++ /dev/null @@ -1,410 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pb_1: times 16 db 1 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 -sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 -sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 -sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -SECTION .text - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM ssse3 -cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - pshufb m1, m3, [GLOBAL(sh_b23456777)] - pshufb m2, m3, [GLOBAL(sh_b12345677)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movd [dstq ], m3 - movd [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m3, 1 - psrldq m4, 1 - movd [dstq ], m3 - movd [dstq+strideq], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; l1, l2, l3, l4 - movd m1, [aboveq-1] ; tl, t1, t2, t3 - punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 - pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 - psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 - psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 - ; comments below are for a predictor like this - ; A1 B1 C1 D1 - ; A2 B2 A1 B1 - ; A3 B3 A2 B2 - ; A4 B4 A3 B3 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 - pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 - - punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+stride3q ], m3 - psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq*2], m3 - psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq ], m3 - psrldq m3, 2 ; A1 B1 C1 D1 .. - movd [dstq ], m3 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - movq m0, [leftq] ; [0- 7] l1-8 [byte] - movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] - pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] - pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] - pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] - pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] - psrldq m4, m0, 1 ; t1-7 [word] - psrldq m5, m0, 2 ; t2-7 [word] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 - ; A2 B2 A1 B1 C1 D1 E1 F1 - ; A3 B3 A2 B2 A1 B1 C1 D1 - ; A4 B4 A3 B3 A2 B2 A1 B1 - ; A5 B5 A4 B4 A3 B3 A2 B2 - ; A6 B6 A5 B5 A4 B4 A3 B3 - ; A7 B7 A6 B6 A5 B5 A4 B4 - ; A8 B8 A7 B7 A6 B6 A5 B5 - pavgb m6, m1, m2 ; 2-tap avg A8-A1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 - - punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - - movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 - palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 - movq [dstq+strideq*2], m0 - psrldq m0, 2 ; A-B2, A-B1, C-H1 - movq [dstq+strideq ], m0 - psrldq m0, 2 ; A-H1 - movq [dstq ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 - psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 - movq [dstq+strideq*2], m6 - psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 - movq [dstq+strideq ], m6 - psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 - movq [dstq ], m6 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 - ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 - ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 - ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 - ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 - ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 - ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 - ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 - ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 - ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 - ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 - ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 - ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 - ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 - ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 - ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 - pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m6, 15 - palignr m3, m0, m6, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] - pavgb m5, m0 ; A1 - Ag - - punpcklbw m0, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... A-Bg - - pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] - pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 - - pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - palignr m2, m1, m6, 14 - mova [dstq ], m2 - palignr m2, m1, m6, 12 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 10 - mova [dstq+strideq*2], m2 - palignr m2, m1, m6, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m1, m6, 6 - mova [dstq ], m2 - palignr m2, m1, m6, 4 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 2 - mova [dstq+strideq*2], m2 - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - mova [dstq+stride3q ], m6 - lea dstq, [dstq+strideq*4] - - palignr m2, m6, m4, 14 - mova [dstq ], m2 - palignr m2, m6, m4, 12 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 10 - mova [dstq+strideq*2], m2 - palignr m2, m6, m4, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m6, m4, 6 - mova [dstq ], m2 - palignr m2, m6, m4, 4 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 2 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - movu m1, [aboveq+15] - - pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] - - palignr m3, m1, m7, 1 - palignr m5, m1, m7, 2 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] - - pshufb m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m7, 15 - palignr m3, m0, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pavgb m5, m0 ; A1 - Ag - punpcklbw m6, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... A-Bg - pshufb m6, [GLOBAL(sh_bfedcba9876543210)] - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - - DEFINE_ARGS dst, stride, stride3, left, line - lea stride3q, [strideq*3] - - palignr m5, m2, m1, 14 - palignr m7, m1, m6, 14 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 12 - palignr m7, m1, m6, 12 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 10 - palignr m7, m1, m6, 10 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - palignr m5, m2, m1, 8 - palignr m7, m1, m6, 8 - mova [dstq+stride3q ], m7 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m2, m1, 6 - palignr m7, m1, m6, 6 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 4 - palignr m7, m1, m6, 4 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 2 - palignr m7, m1, m6, 2 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m6 - mova [dstq+stride3q+16 ], m1 - lea dstq, [dstq+strideq*4] - - palignr m5, m1, m6, 14 - palignr m3, m6, m4, 14 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 12 - palignr m3, m6, m4, 12 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 10 - palignr m3, m6, m4, 10 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - palignr m5, m1, m6, 8 - palignr m3, m6, m4, 8 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m1, m6, 6 - palignr m3, m6, m4, 6 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 4 - palignr m3, m6, m4, 4 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 2 - palignr m3, m6, m4, 2 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m4 - mova [dstq+stride3q+16 ], m6 - lea dstq, [dstq+strideq*4] - - mova m7, [leftq] - mova m3, [leftq+16] - palignr m5, m3, m7, 15 - palignr m0, m3, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - - pavgb m5, m3 ; Ah - - punpcklbw m3, m2, m5 ; A-B8 ... A-B1 - punpckhbw m2, m5 ; A-B9 ... A-Bg - pshufb m3, [GLOBAL(sh_bfedcba9876543210)] - pshufb m2, [GLOBAL(sh_bfedcba9876543210)] - - palignr m7, m6, m4, 14 - palignr m0, m4, m3, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 12 - palignr m0, m4, m3, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 10 - palignr m0, m4, m3, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m6, m4, 8 - palignr m0, m4, m3, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m6, m4, 6 - palignr m0, m4, m3, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 4 - palignr m0, m4, m3, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 2 - palignr m0, m4, m3, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m4 - lea dstq, [dstq+strideq*4] - - palignr m7, m4, m3, 14 - palignr m0, m3, m2, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 12 - palignr m0, m3, m2, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 10 - palignr m0, m3, m2, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m4, m3, 8 - palignr m0, m3, m2, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m4, m3, 6 - palignr m0, m3, m2, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 4 - palignr m0, m3, m2, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 2 - palignr m0, m3, m2, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m3 - - RESTORE_GOT - RET diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm new file mode 100644 index 000000000..bc1bb2ff3 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm @@ -0,0 +1,410 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA + +pb_1: times 16 db 1 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 +sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 +sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 +sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +SECTION .text + +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 + pavgb %4, %1, %3 + pxor %3, %1 + pand %3, [GLOBAL(pb_1)] + psubb %4, %3 + pavgb %4, %2 +%endmacro + +INIT_XMM ssse3 +cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset + GET_GOT goffsetq + + movq m3, [aboveq] + pshufb m1, m3, [GLOBAL(sh_b23456777)] + pshufb m2, m3, [GLOBAL(sh_b12345677)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 + pavgb m3, m2 + + ; store 4 lines + movd [dstq ], m3 + movd [dstq+strideq], m4 + lea dstq, [dstq+strideq*2] + psrldq m3, 1 + psrldq m4, 1 + movd [dstq ], m3 + movd [dstq+strideq], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + movd m0, [leftq] ; l1, l2, l3, l4 + movd m1, [aboveq-1] ; tl, t1, t2, t3 + punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 + pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 + psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 + psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 + ; comments below are for a predictor like this + ; A1 B1 C1 D1 + ; A2 B2 A1 B1 + ; A3 B3 A2 B2 + ; A4 B4 A3 B3 + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 + pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 + + punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+stride3q ], m3 + psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq*2], m3 + psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. + movd [dstq+strideq ], m3 + psrldq m3, 2 ; A1 B1 C1 D1 .. + movd [dstq ], m3 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + movq m0, [leftq] ; [0- 7] l1-8 [byte] + movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] + pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] + pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] + pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] + pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] + psrldq m4, m0, 1 ; t1-7 [word] + psrldq m5, m0, 2 ; t2-7 [word] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 + ; A2 B2 A1 B1 C1 D1 E1 F1 + ; A3 B3 A2 B2 A1 B1 C1 D1 + ; A4 B4 A3 B3 A2 B2 A1 B1 + ; A5 B5 A4 B4 A3 B3 A2 B2 + ; A6 B6 A5 B5 A4 B4 A3 B3 + ; A7 B7 A6 B6 A5 B5 A4 B4 + ; A8 B8 A7 B7 A6 B6 A5 B5 + pavgb m6, m1, m2 ; 2-tap avg A8-A1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 + + punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 + + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + + movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 + palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 + movq [dstq+strideq*2], m0 + psrldq m0, 2 ; A-B2, A-B1, C-H1 + movq [dstq+strideq ], m0 + psrldq m0, 2 ; A-H1 + movq [dstq ], m0 + lea dstq, [dstq+strideq*4] + movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 + psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 + movq [dstq+strideq*2], m6 + psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 + movq [dstq+strideq ], m6 + psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 + movq [dstq ], m6 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + ; comments below are for a predictor like this + ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 + ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 + ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 + ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 + ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 + ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 + ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 + ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 + ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 + ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 + ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 + ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 + ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 + ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 + ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 + ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 + pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m6, 15 + palignr m3, m0, m6, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] + pavgb m5, m0 ; A1 - Ag + + punpcklbw m0, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... A-Bg + + pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] + pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 + + pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + palignr m2, m1, m6, 14 + mova [dstq ], m2 + palignr m2, m1, m6, 12 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 10 + mova [dstq+strideq*2], m2 + palignr m2, m1, m6, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m1, m6, 6 + mova [dstq ], m2 + palignr m2, m1, m6, 4 + mova [dstq+strideq ], m2 + palignr m2, m1, m6, 2 + mova [dstq+strideq*2], m2 + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + mova [dstq+stride3q ], m6 + lea dstq, [dstq+strideq*4] + + palignr m2, m6, m4, 14 + mova [dstq ], m2 + palignr m2, m6, m4, 12 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 10 + mova [dstq+strideq*2], m2 + palignr m2, m6, m4, 8 + mova [dstq+stride3q ], m2 + lea dstq, [dstq+strideq*4] + palignr m2, m6, m4, 6 + mova [dstq ], m2 + palignr m2, m6, m4, 4 + mova [dstq+strideq ], m2 + palignr m2, m6, m4, 2 + mova [dstq+strideq*2], m2 + mova [dstq+stride3q ], m4 + RESTORE_GOT + RET + +INIT_XMM ssse3 +cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset + GET_GOT goffsetq + mova m0, [leftq] + movu m7, [aboveq-1] + movu m1, [aboveq+15] + + pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] + pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] + + palignr m3, m1, m7, 1 + palignr m5, m1, m7, 2 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] + + pshufb m7, [GLOBAL(sh_bfedcba9876543210)] + palignr m5, m0, m7, 15 + palignr m3, m0, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg + pavgb m5, m0 ; A1 - Ag + punpcklbw m6, m4, m5 ; A-B8 ... A-B1 + punpckhbw m4, m5 ; A-B9 ... A-Bg + pshufb m6, [GLOBAL(sh_bfedcba9876543210)] + pshufb m4, [GLOBAL(sh_bfedcba9876543210)] + + DEFINE_ARGS dst, stride, stride3, left, line + lea stride3q, [strideq*3] + + palignr m5, m2, m1, 14 + palignr m7, m1, m6, 14 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 12 + palignr m7, m1, m6, 12 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 10 + palignr m7, m1, m6, 10 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + palignr m5, m2, m1, 8 + palignr m7, m1, m6, 8 + mova [dstq+stride3q ], m7 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m2, m1, 6 + palignr m7, m1, m6, 6 + mova [dstq ], m7 + mova [dstq+16 ], m5 + palignr m5, m2, m1, 4 + palignr m7, m1, m6, 4 + mova [dstq+strideq ], m7 + mova [dstq+strideq+16 ], m5 + palignr m5, m2, m1, 2 + palignr m7, m1, m6, 2 + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m6 + mova [dstq+stride3q+16 ], m1 + lea dstq, [dstq+strideq*4] + + palignr m5, m1, m6, 14 + palignr m3, m6, m4, 14 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 12 + palignr m3, m6, m4, 12 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 10 + palignr m3, m6, m4, 10 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + palignr m5, m1, m6, 8 + palignr m3, m6, m4, 8 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m5 + lea dstq, [dstq+strideq*4] + palignr m5, m1, m6, 6 + palignr m3, m6, m4, 6 + mova [dstq ], m3 + mova [dstq+16 ], m5 + palignr m5, m1, m6, 4 + palignr m3, m6, m4, 4 + mova [dstq+strideq ], m3 + mova [dstq+strideq+16 ], m5 + palignr m5, m1, m6, 2 + palignr m3, m6, m4, 2 + mova [dstq+strideq*2 ], m3 + mova [dstq+strideq*2+16], m5 + mova [dstq+stride3q ], m4 + mova [dstq+stride3q+16 ], m6 + lea dstq, [dstq+strideq*4] + + mova m7, [leftq] + mova m3, [leftq+16] + palignr m5, m3, m7, 15 + palignr m0, m3, m7, 14 + + X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - + pavgb m5, m3 ; Ah - + punpcklbw m3, m2, m5 ; A-B8 ... A-B1 + punpckhbw m2, m5 ; A-B9 ... A-Bg + pshufb m3, [GLOBAL(sh_bfedcba9876543210)] + pshufb m2, [GLOBAL(sh_bfedcba9876543210)] + + palignr m7, m6, m4, 14 + palignr m0, m4, m3, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 12 + palignr m0, m4, m3, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 10 + palignr m0, m4, m3, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m6, m4, 8 + palignr m0, m4, m3, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m6, m4, 6 + palignr m0, m4, m3, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m6, m4, 4 + palignr m0, m4, m3, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m6, m4, 2 + palignr m0, m4, m3, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m3 + mova [dstq+stride3q+16 ], m4 + lea dstq, [dstq+strideq*4] + + palignr m7, m4, m3, 14 + palignr m0, m3, m2, 14 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 12 + palignr m0, m3, m2, 12 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 10 + palignr m0, m3, m2, 10 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + palignr m7, m4, m3, 8 + palignr m0, m3, m2, 8 + mova [dstq+stride3q ], m0 + mova [dstq+stride3q+16 ], m7 + lea dstq, [dstq+strideq*4] + palignr m7, m4, m3, 6 + palignr m0, m3, m2, 6 + mova [dstq ], m0 + mova [dstq+16 ], m7 + palignr m7, m4, m3, 4 + palignr m0, m3, m2, 4 + mova [dstq+strideq ], m0 + mova [dstq+strideq+16 ], m7 + palignr m7, m4, m3, 2 + palignr m0, m3, m2, 2 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16], m7 + mova [dstq+stride3q ], m2 + mova [dstq+stride3q+16 ], m3 + + RESTORE_GOT + RET -- cgit v1.2.3 From 125aff11b7587a55d5a94b1337e44cbc68655c0b Mon Sep 17 00:00:00 2001 From: trav90 Date: Thu, 18 Oct 2018 21:56:49 -0500 Subject: Fix aom compile errors with VS2015 Import BUG=aomedia:900 --- third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c | 1 + 1 file changed, 1 insertion(+) (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c index 41b55c985..e001a1d70 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c @@ -11,6 +11,7 @@ #include +#include "aom_ports/msvc.h" #include "./aom_dsp_rtcd.h" // ----------------------------------------------------------------------------- -- cgit v1.2.3 From bbcc64772580c8a979288791afa02d30bc476d2e Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 21:52:15 -0500 Subject: Update aom to v1.0.0 Update aom to commit id d14c5bb4f336ef1842046089849dee4a301fbbf0. --- third_party/aom/aom_dsp/add_noise.c | 4 +- third_party/aom/aom_dsp/ans.h | 44 - third_party/aom/aom_dsp/ansreader.h | 214 -- third_party/aom/aom_dsp/answriter.h | 148 - third_party/aom/aom_dsp/aom_convolve.c | 1171 +------ third_party/aom/aom_dsp/aom_convolve.h | 12 +- third_party/aom/aom_dsp/aom_dsp.cmake | 817 ++--- third_party/aom/aom_dsp/aom_dsp.mk | 439 --- third_party/aom/aom_dsp/aom_dsp_common.h | 29 +- third_party/aom/aom_dsp/aom_dsp_rtcd.c | 6 +- third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 1248 ++++--- third_party/aom/aom_dsp/aom_filter.h | 7 + third_party/aom/aom_dsp/aom_simd.h | 5 +- .../aom/aom_dsp/arm/aom_convolve8_avg_neon.c | 364 -- .../aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm | 295 -- third_party/aom/aom_dsp/arm/aom_convolve8_neon.c | 331 -- .../aom/aom_dsp/arm/aom_convolve8_neon_asm.asm | 273 -- .../aom/aom_dsp/arm/aom_convolve_avg_neon.c | 145 - .../aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm | 119 - .../aom/aom_dsp/arm/aom_convolve_copy_neon.c | 93 - .../aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm | 87 - third_party/aom/aom_dsp/arm/aom_convolve_neon.c | 66 - third_party/aom/aom_dsp/arm/avg_neon.c | 216 -- third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c | 449 +++ third_party/aom/aom_dsp/arm/fwd_txfm_neon.c | 3 +- third_party/aom/aom_dsp/arm/hadamard_neon.c | 200 -- third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c | 59 - .../aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm | 201 -- third_party/aom/aom_dsp/arm/idct16x16_add_neon.c | 1295 -------- .../aom/aom_dsp/arm/idct16x16_add_neon_asm.asm | 1182 ------- third_party/aom/aom_dsp/arm/idct16x16_neon.c | 152 - third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c | 141 - .../aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm | 147 - third_party/aom/aom_dsp/arm/idct32x32_add_neon.c | 686 ---- .../aom/aom_dsp/arm/idct32x32_add_neon_asm.asm | 1302 -------- third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c | 47 - .../aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm | 71 - third_party/aom/aom_dsp/arm/idct4x4_add_neon.c | 146 - .../aom/aom_dsp/arm/idct4x4_add_neon_asm.asm | 193 -- third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c | 62 - .../aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm | 91 - third_party/aom/aom_dsp/arm/idct8x8_add_neon.c | 509 --- .../aom/aom_dsp/arm/idct8x8_add_neon_asm.asm | 522 --- third_party/aom/aom_dsp/arm/intrapred_neon.c | 8 +- third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm | 287 -- third_party/aom/aom_dsp/arm/loopfilter_16_neon.c | 174 - .../aom/aom_dsp/arm/loopfilter_16_neon_asm.asm | 202 -- third_party/aom/aom_dsp/arm/loopfilter_4_neon.c | 250 -- .../aom/aom_dsp/arm/loopfilter_4_neon_asm.asm | 252 -- third_party/aom/aom_dsp/arm/loopfilter_8_neon.c | 430 --- .../aom/aom_dsp/arm/loopfilter_8_neon_asm.asm | 428 --- third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm | 638 ---- third_party/aom/aom_dsp/arm/loopfilter_neon.c | 707 +++- third_party/aom/aom_dsp/arm/sad4d_neon.c | 5 +- third_party/aom/aom_dsp/arm/sad_neon.c | 2 +- third_party/aom/aom_dsp/arm/save_reg_neon.asm | 39 - third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 5 +- third_party/aom/aom_dsp/arm/subtract_neon.c | 3 +- third_party/aom/aom_dsp/arm/variance_neon.c | 4 +- third_party/aom/aom_dsp/avg.c | 190 -- third_party/aom/aom_dsp/binary_codes_reader.c | 104 +- third_party/aom/aom_dsp/binary_codes_reader.h | 20 +- third_party/aom/aom_dsp/binary_codes_writer.c | 67 +- third_party/aom/aom_dsp/binary_codes_writer.h | 12 +- third_party/aom/aom_dsp/bitreader.h | 109 +- third_party/aom/aom_dsp/bitreader_buffer.c | 15 +- third_party/aom/aom_dsp/bitreader_buffer.h | 2 +- third_party/aom/aom_dsp/bitwriter.h | 132 +- third_party/aom/aom_dsp/bitwriter_buffer.c | 15 +- third_party/aom/aom_dsp/bitwriter_buffer.h | 5 + third_party/aom/aom_dsp/blend.h | 3 + third_party/aom/aom_dsp/blend_a64_hmask.c | 8 +- third_party/aom/aom_dsp/blend_a64_mask.c | 187 +- third_party/aom/aom_dsp/blend_a64_vmask.c | 8 +- third_party/aom/aom_dsp/buf_ans.h | 4 +- third_party/aom/aom_dsp/daalaboolreader.c | 4 + third_party/aom/aom_dsp/daalaboolreader.h | 8 +- third_party/aom/aom_dsp/daalaboolwriter.c | 5 +- third_party/aom/aom_dsp/daalaboolwriter.h | 9 +- third_party/aom/aom_dsp/entcode.c | 4 - third_party/aom/aom_dsp/entcode.h | 20 +- third_party/aom/aom_dsp/entdec.c | 89 +- third_party/aom/aom_dsp/entdec.h | 8 +- third_party/aom/aom_dsp/entenc.c | 192 +- third_party/aom/aom_dsp/entenc.h | 6 - third_party/aom/aom_dsp/fastssim.c | 42 +- third_party/aom/aom_dsp/fft.c | 219 ++ third_party/aom/aom_dsp/fft_common.h | 1050 ++++++ third_party/aom/aom_dsp/fwd_txfm.c | 668 +--- third_party/aom/aom_dsp/fwd_txfm.h | 24 - third_party/aom/aom_dsp/grain_synthesis.c | 1392 ++++++++ third_party/aom/aom_dsp/grain_synthesis.h | 118 + third_party/aom/aom_dsp/grain_table.c | 333 ++ third_party/aom/aom_dsp/grain_table.h | 102 + third_party/aom/aom_dsp/intrapred.c | 816 ++--- third_party/aom/aom_dsp/intrapred_common.h | 10 +- third_party/aom/aom_dsp/inv_txfm.c | 1482 --------- third_party/aom/aom_dsp/inv_txfm.h | 101 - third_party/aom/aom_dsp/loopfilter.c | 615 +--- third_party/aom/aom_dsp/mips/add_noise_msa.c | 3 +- .../aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c | 704 ---- .../aom/aom_dsp/mips/aom_convolve8_avg_msa.c | 605 ---- .../aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c | 677 ---- .../aom/aom_dsp/mips/aom_convolve8_horiz_msa.c | 4 +- third_party/aom/aom_dsp/mips/aom_convolve8_msa.c | 630 ---- .../aom/aom_dsp/mips/aom_convolve8_vert_msa.c | 4 +- .../aom/aom_dsp/mips/aom_convolve_avg_msa.c | 233 -- third_party/aom/aom_dsp/mips/aom_convolve_msa.h | 45 - third_party/aom/aom_dsp/mips/common_dspr2.h | 4 +- third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c | 256 -- .../aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c | 802 ----- third_party/aom/aom_dsp/mips/convolve2_dspr2.c | 3 +- .../aom/aom_dsp/mips/convolve2_horiz_dspr2.c | 3 +- .../aom/aom_dsp/mips/convolve2_vert_dspr2.c | 3 +- third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c | 646 ---- .../aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c | 998 ------ third_party/aom/aom_dsp/mips/convolve8_dspr2.c | 1379 +------- .../aom/aom_dsp/mips/convolve8_horiz_dspr2.c | 3 +- .../aom/aom_dsp/mips/convolve8_vert_dspr2.c | 3 +- .../aom/aom_dsp/mips/convolve_common_dspr2.h | 15 +- third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c | 928 ------ third_party/aom/aom_dsp/mips/fwd_txfm_msa.c | 238 -- third_party/aom/aom_dsp/mips/fwd_txfm_msa.h | 381 --- third_party/aom/aom_dsp/mips/idct16x16_msa.c | 486 --- third_party/aom/aom_dsp/mips/idct32x32_msa.c | 730 ---- third_party/aom/aom_dsp/mips/idct4x4_msa.c | 99 - third_party/aom/aom_dsp/mips/idct8x8_msa.c | 117 - third_party/aom/aom_dsp/mips/intrapred_msa.c | 3 +- third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h | 82 - third_party/aom/aom_dsp/mips/inv_txfm_msa.h | 412 --- third_party/aom/aom_dsp/mips/itrans16_dspr2.c | 1190 ------- third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c | 1042 ------ third_party/aom/aom_dsp/mips/itrans32_dspr2.c | 1030 ------ third_party/aom/aom_dsp/mips/itrans4_dspr2.c | 342 -- third_party/aom/aom_dsp/mips/itrans8_dspr2.c | 645 ---- third_party/aom/aom_dsp/mips/loopfilter_16_msa.c | 23 +- .../aom/aom_dsp/mips/loopfilter_filters_dspr2.c | 3 +- .../aom/aom_dsp/mips/loopfilter_filters_dspr2.h | 3 +- .../aom/aom_dsp/mips/loopfilter_macros_dspr2.h | 3 +- .../aom/aom_dsp/mips/loopfilter_masks_dspr2.h | 3 +- third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c | 3 +- .../aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c | 12 +- .../aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c | 3 +- third_party/aom/aom_dsp/mips/macros_msa.h | 3 +- third_party/aom/aom_dsp/mips/sad_msa.c | 733 +--- .../aom/aom_dsp/mips/sub_pixel_variance_msa.c | 3 +- third_party/aom/aom_dsp/mips/subtract_msa.c | 3 +- third_party/aom/aom_dsp/mips/txfm_macros_msa.h | 97 - third_party/aom/aom_dsp/mips/variance_msa.c | 3 +- third_party/aom/aom_dsp/noise_model.c | 1460 ++++++++ third_party/aom/aom_dsp/noise_model.h | 286 ++ third_party/aom/aom_dsp/noise_util.c | 221 ++ third_party/aom/aom_dsp/noise_util.h | 68 + third_party/aom/aom_dsp/prob.c | 217 -- third_party/aom/aom_dsp/prob.h | 769 ++++- third_party/aom/aom_dsp/psnr.c | 41 +- third_party/aom/aom_dsp/psnr.h | 16 +- third_party/aom/aom_dsp/psnrhvs.c | 58 +- third_party/aom/aom_dsp/quantize.c | 73 +- third_party/aom/aom_dsp/quantize.h | 67 +- third_party/aom/aom_dsp/sad.c | 409 +-- third_party/aom/aom_dsp/sad_av1.c | 248 ++ third_party/aom/aom_dsp/simd/v128_intrinsics.h | 80 +- third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h | 350 +- third_party/aom/aom_dsp/simd/v128_intrinsics_c.h | 241 +- third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h | 187 +- third_party/aom/aom_dsp/simd/v256_intrinsics.h | 90 +- third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h | 2 +- third_party/aom/aom_dsp/simd/v256_intrinsics_c.h | 251 +- .../aom/aom_dsp/simd/v256_intrinsics_v128.h | 656 +++- third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h | 386 ++- third_party/aom/aom_dsp/simd/v64_intrinsics.h | 9 +- third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h | 135 +- third_party/aom/aom_dsp/simd/v64_intrinsics_c.h | 61 +- third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h | 41 +- third_party/aom/aom_dsp/ssim.c | 13 +- third_party/aom/aom_dsp/ssim.h | 5 +- third_party/aom/aom_dsp/subtract.c | 6 +- third_party/aom/aom_dsp/sum_squares.c | 2 +- third_party/aom/aom_dsp/txfm_common.h | 667 +--- third_party/aom/aom_dsp/variance.c | 915 +++-- third_party/aom/aom_dsp/variance.h | 36 +- third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 110 +- .../aom/aom_dsp/x86/aom_convolve_copy_sse2.asm | 48 - .../aom/aom_dsp/x86/aom_convolve_hip_sse2.c | 195 -- .../aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 356 +- .../x86/aom_high_subpixel_bilinear_sse2.asm | 163 +- .../aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c | 203 -- .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 31 +- .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 616 +--- .../aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 379 +-- .../aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 19 - .../aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 160 +- .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 158 +- third_party/aom/aom_dsp/x86/avg_intrin_sse2.c | 380 --- third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm | 124 - third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c | 12 +- third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c | 213 +- third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c | 40 +- third_party/aom/aom_dsp/x86/blend_sse4.h | 2 - third_party/aom/aom_dsp/x86/common_avx2.h | 2 +- third_party/aom/aom_dsp/x86/convolve.h | 136 +- third_party/aom/aom_dsp/x86/convolve_avx2.h | 198 ++ .../aom/aom_dsp/x86/convolve_common_intrin.h | 31 + third_party/aom/aom_dsp/x86/convolve_sse2.h | 121 + third_party/aom/aom_dsp/x86/convolve_sse4_1.h | 53 + third_party/aom/aom_dsp/x86/fft_avx2.c | 73 + third_party/aom/aom_dsp/x86/fft_sse2.c | 166 + third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c | 862 ----- .../aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h | 3022 ----------------- .../aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h | 3201 ------------------ third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c | 24 - third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h | 35 - third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h | 674 +--- third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c | 69 +- third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 207 -- .../aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 429 ++- .../aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm | 2 + .../aom/aom_dsp/x86/halfpix_variance_sse2.c | 5 +- third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c | 611 ++-- .../aom/aom_dsp/x86/highbd_convolve_ssse3.c | 251 ++ .../aom/aom_dsp/x86/highbd_intrapred_avx2.c | 240 -- .../aom/aom_dsp/x86/highbd_intrapred_sse2.c | 274 +- .../aom/aom_dsp/x86/highbd_intrapred_ssse3.c | 521 --- .../aom/aom_dsp/x86/highbd_loopfilter_avx2.c | 835 +---- .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 2040 ++++++++---- .../aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 3 +- third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm | 2 - third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm | 8 - .../x86/highbd_subpel_variance_impl_sse2.asm | 80 +- third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c | 18 +- .../aom/aom_dsp/x86/highbd_variance_impl_sse2.asm | 2 + third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 309 +- third_party/aom/aom_dsp/x86/highbd_variance_sse4.c | 4 +- third_party/aom/aom_dsp/x86/intrapred_avx2.c | 408 ++- third_party/aom/aom_dsp/x86/intrapred_sse2.c | 882 ++++- third_party/aom/aom_dsp/x86/intrapred_ssse3.c | 1405 ++++++-- .../aom/aom_dsp/x86/intrapred_ssse3_asm.asm | 410 --- third_party/aom/aom_dsp/x86/inv_txfm_avx2.c | 1238 ------- third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h | 80 - third_party/aom/aom_dsp/x86/inv_txfm_sse2.c | 3500 -------------------- third_party/aom/aom_dsp/x86/inv_txfm_sse2.h | 265 -- third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c | 1333 -------- third_party/aom/aom_dsp/x86/inv_wht_sse2.asm | 5 - third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c | 238 ++ third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c | 298 ++ third_party/aom/aom_dsp/x86/loopfilter_avx2.c | 13 +- third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 3210 ++++++++---------- third_party/aom/aom_dsp/x86/lpf_common_sse2.h | 305 +- .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 24 +- .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 65 +- .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.h | 92 + third_party/aom/aom_dsp/x86/mem_sse2.h | 42 + third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h | 2 +- third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 47 +- third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 69 +- .../aom/aom_dsp/x86/quantize_avx_x86_64.asm | 83 +- third_party/aom/aom_dsp/x86/quantize_sse2.c | 3 +- .../aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 63 +- third_party/aom/aom_dsp/x86/sad4d_avx2.c | 4 +- third_party/aom/aom_dsp/x86/sad4d_sse2.asm | 4 - third_party/aom/aom_dsp/x86/sad_avx2.c | 4 +- third_party/aom/aom_dsp/x86/sad_highbd_avx2.c | 18 +- third_party/aom/aom_dsp/x86/sad_impl_avx2.c | 3 +- third_party/aom/aom_dsp/x86/sad_sse2.asm | 14 - third_party/aom/aom_dsp/x86/sad_sse3.asm | 377 --- third_party/aom/aom_dsp/x86/sad_sse4.asm | 362 -- third_party/aom/aom_dsp/x86/sad_ssse3.asm | 373 --- third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm | 3 + .../aom/aom_dsp/x86/subpel_variance_sse2.asm | 88 +- third_party/aom/aom_dsp/x86/subtract_sse2.asm | 4 - third_party/aom/aom_dsp/x86/sum_squares_sse2.c | 143 +- third_party/aom/aom_dsp/x86/synonyms.h | 33 +- third_party/aom/aom_dsp/x86/synonyms_avx2.h | 64 + third_party/aom/aom_dsp/x86/transpose_sse2.h | 420 +++ third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 78 - third_party/aom/aom_dsp/x86/txfm_common_intrin.h | 31 - third_party/aom/aom_dsp/x86/txfm_common_sse2.h | 301 +- third_party/aom/aom_dsp/x86/variance_avx2.c | 503 ++- third_party/aom/aom_dsp/x86/variance_impl_avx2.c | 200 +- third_party/aom/aom_dsp/x86/variance_sse2.c | 961 +++--- 281 files changed, 21378 insertions(+), 63336 deletions(-) delete mode 100644 third_party/aom/aom_dsp/ans.h delete mode 100644 third_party/aom/aom_dsp/ansreader.h delete mode 100644 third_party/aom/aom_dsp/answriter.h delete mode 100644 third_party/aom/aom_dsp/aom_dsp.mk delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/aom_convolve_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/avg_neon.c create mode 100644 third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/hadamard_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct16x16_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_4_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon.c delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm delete mode 100644 third_party/aom/aom_dsp/arm/save_reg_neon.asm delete mode 100644 third_party/aom/aom_dsp/avg.c create mode 100644 third_party/aom/aom_dsp/fft.c create mode 100644 third_party/aom/aom_dsp/fft_common.h delete mode 100644 third_party/aom/aom_dsp/fwd_txfm.h create mode 100644 third_party/aom/aom_dsp/grain_synthesis.c create mode 100644 third_party/aom/aom_dsp/grain_synthesis.h create mode 100644 third_party/aom/aom_dsp/grain_table.c create mode 100644 third_party/aom/aom_dsp/grain_table.h delete mode 100644 third_party/aom/aom_dsp/inv_txfm.c delete mode 100644 third_party/aom/aom_dsp/inv_txfm.h delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/fwd_txfm_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/fwd_txfm_msa.h delete mode 100644 third_party/aom/aom_dsp/mips/idct16x16_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/idct32x32_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/idct4x4_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/idct8x8_msa.c delete mode 100644 third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h delete mode 100644 third_party/aom/aom_dsp/mips/inv_txfm_msa.h delete mode 100644 third_party/aom/aom_dsp/mips/itrans16_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/itrans32_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/itrans4_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/itrans8_dspr2.c delete mode 100644 third_party/aom/aom_dsp/mips/txfm_macros_msa.h create mode 100644 third_party/aom/aom_dsp/noise_model.c create mode 100644 third_party/aom/aom_dsp/noise_model.h create mode 100644 third_party/aom/aom_dsp/noise_util.c create mode 100644 third_party/aom/aom_dsp/noise_util.h delete mode 100644 third_party/aom/aom_dsp/prob.c create mode 100644 third_party/aom/aom_dsp/sad_av1.c delete mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c delete mode 100644 third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c delete mode 100644 third_party/aom/aom_dsp/x86/avg_intrin_sse2.c delete mode 100644 third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm create mode 100644 third_party/aom/aom_dsp/x86/convolve_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_common_intrin.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_sse2.h create mode 100644 third_party/aom/aom_dsp/x86/convolve_sse4_1.h create mode 100644 third_party/aom/aom_dsp/x86/fft_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/fft_sse2.c delete mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c delete mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h delete mode 100644 third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c delete mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c delete mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm delete mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_avx2.c delete mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h delete mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_sse2.c delete mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_sse2.h delete mode 100644 third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c create mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h create mode 100644 third_party/aom/aom_dsp/x86/mem_sse2.h delete mode 100644 third_party/aom/aom_dsp/x86/sad_sse3.asm delete mode 100644 third_party/aom/aom_dsp/x86/sad_sse4.asm delete mode 100644 third_party/aom/aom_dsp/x86/sad_ssse3.asm create mode 100644 third_party/aom/aom_dsp/x86/synonyms_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/transpose_sse2.h delete mode 100644 third_party/aom/aom_dsp/x86/txfm_common_avx2.h delete mode 100644 third_party/aom/aom_dsp/x86/txfm_common_intrin.h (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c index 389cf2049..bfb3e7e00 100644 --- a/third_party/aom/aom_dsp/add_noise.c +++ b/third_party/aom/aom_dsp/add_noise.c @@ -12,8 +12,8 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/ans.h b/third_party/aom/aom_dsp/ans.h deleted file mode 100644 index a7a2f0eab..000000000 --- a/third_party/aom/aom_dsp/ans.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_ANS_H_ -#define AOM_DSP_ANS_H_ -// Constants, types and utilities for Asymmetric Numeral Systems -// http://arxiv.org/abs/1311.2540v2 - -#include -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "aom_dsp/prob.h" - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -// Use windowed ANS, size is passed in at initialization -#define ANS_MAX_SYMBOLS 1 -#define ANS_REVERSE 1 - -typedef uint8_t AnsP8; -#define ANS_P8_PRECISION 256u -#define ANS_P8_SHIFT 8 -#define RANS_PROB_BITS 15 -#define RANS_PRECISION (1u << RANS_PROB_BITS) - -// L_BASE is the ANS base state. L_BASE % PRECISION must be 0. -#define L_BASE (1u << 17) -#define IO_BASE 256 -// Range I = { L_BASE, L_BASE + 1, ..., L_BASE * IO_BASE - 1 } - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // AOM_DSP_ANS_H_ diff --git a/third_party/aom/aom_dsp/ansreader.h b/third_party/aom/aom_dsp/ansreader.h deleted file mode 100644 index e50c63b2d..000000000 --- a/third_party/aom/aom_dsp/ansreader.h +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_ANSREADER_H_ -#define AOM_DSP_ANSREADER_H_ -// An implementation of Asymmetric Numeral Systems -// http://arxiv.org/abs/1311.2540v2 -// Implements decoding of: -// * rABS (range Asymmetric Binary Systems), a boolean coder -// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder - -#include -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "aom_dsp/prob.h" -#include "aom_dsp/ans.h" -#include "aom_ports/mem_ops.h" -#if CONFIG_ACCOUNTING -#include "av1/decoder/accounting.h" -#endif - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -struct AnsDecoder { - const uint8_t *buf; - int buf_offset; - uint32_t state; -#if ANS_MAX_SYMBOLS - int symbols_left; - int window_size; -#endif -#if CONFIG_ACCOUNTING - Accounting *accounting; -#endif -}; - -static INLINE int ans_read_reinit(struct AnsDecoder *const ans); - -static INLINE unsigned refill_state(struct AnsDecoder *const ans, - unsigned state) { -#if ANS_REVERSE - while (state < L_BASE && ans->buf_offset < 0) { - state = state * IO_BASE + ans->buf[ans->buf_offset++]; - } -#else - while (state < L_BASE && ans->buf_offset > 0) { - state = state * IO_BASE + ans->buf[--ans->buf_offset]; - } -#endif - return state; -} - -// Decode one rABS encoded boolean where the probability of the value being zero -// is p0. -static INLINE int rabs_read(struct AnsDecoder *ans, AnsP8 p0) { -#if ANS_MAX_SYMBOLS - if (ans->symbols_left-- == 0) { - ans_read_reinit(ans); - ans->symbols_left--; - } -#endif - unsigned state = refill_state(ans, ans->state); - const unsigned quotient = state / ANS_P8_PRECISION; - const unsigned remainder = state % ANS_P8_PRECISION; - const int value = remainder >= p0; - const unsigned qp0 = quotient * p0; - if (value) - state = state - qp0 - p0; - else - state = qp0 + remainder; - ans->state = state; - return value; -} - -// Decode one rABS encoded boolean where the probability of the value being zero -// is one half. -static INLINE int rabs_read_bit(struct AnsDecoder *ans) { -#if ANS_MAX_SYMBOLS - if (ans->symbols_left-- == 0) { - ans_read_reinit(ans); - ans->symbols_left--; - } -#endif - unsigned state = refill_state(ans, ans->state); - const int value = !!(state & 0x80); - ans->state = ((state >> 1) & ~0x7F) | (state & 0x7F); - return value; -} - -struct rans_dec_sym { - uint8_t val; - aom_cdf_prob prob; - aom_cdf_prob cum_prob; // not-inclusive -}; - -static INLINE void fetch_sym(struct rans_dec_sym *out, const aom_cdf_prob *cdf, - aom_cdf_prob rem) { - int i; - aom_cdf_prob cum_prob = 0, top_prob; - // TODO(skal): if critical, could be a binary search. - // Or, better, an O(1) alias-table. - for (i = 0; rem >= (top_prob = cdf[i]); ++i) { - cum_prob = top_prob; - } - out->val = i; - out->prob = top_prob - cum_prob; - out->cum_prob = cum_prob; -} - -static INLINE int rans_read(struct AnsDecoder *ans, const aom_cdf_prob *tab) { - unsigned rem; - unsigned quo; - struct rans_dec_sym sym; -#if ANS_MAX_SYMBOLS - if (ans->symbols_left-- == 0) { - ans_read_reinit(ans); - ans->symbols_left--; - } -#endif - ans->state = refill_state(ans, ans->state); - quo = ans->state / RANS_PRECISION; - rem = ans->state % RANS_PRECISION; - fetch_sym(&sym, tab, rem); - ans->state = quo * sym.prob + rem - sym.cum_prob; - return sym.val; -} - -static INLINE int ans_read_init(struct AnsDecoder *const ans, - const uint8_t *const buf, int offset) { - unsigned x; - if (offset < 1) return 1; -#if ANS_REVERSE - ans->buf = buf + offset; - ans->buf_offset = -offset; - x = buf[0]; - if ((x & 0x80) == 0) { // Marker is 0xxx xxxx - if (offset < 2) return 1; - ans->buf_offset += 2; - ans->state = mem_get_be16(buf) & 0x7FFF; -#if L_BASE * IO_BASE > (1 << 23) - } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx - if (offset < 3) return 1; - ans->buf_offset += 3; - ans->state = mem_get_be24(buf) & 0x3FFFFF; - } else { // Marker is 11xx xxxx - if (offset < 4) return 1; - ans->buf_offset += 4; - ans->state = mem_get_be32(buf) & 0x3FFFFFFF; -#else - } else { // Marker is 1xxx xxxx - if (offset < 3) return 1; - ans->buf_offset += 3; - ans->state = mem_get_be24(buf) & 0x7FFFFF; -#endif - } -#else - ans->buf = buf; - x = buf[offset - 1]; - if ((x & 0x80) == 0) { // Marker is 0xxx xxxx - if (offset < 2) return 1; - ans->buf_offset = offset - 2; - ans->state = mem_get_le16(buf + offset - 2) & 0x7FFF; - } else if ((x & 0xC0) == 0x80) { // Marker is 10xx xxxx - if (offset < 3) return 1; - ans->buf_offset = offset - 3; - ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF; - } else if ((x & 0xE0) == 0xE0) { // Marker is 111x xxxx - if (offset < 4) return 1; - ans->buf_offset = offset - 4; - ans->state = mem_get_le32(buf + offset - 4) & 0x1FFFFFFF; - } else { - // Marker 110x xxxx implies this byte is a superframe marker - return 1; - } -#endif // ANS_REVERSE -#if CONFIG_ACCOUNTING - ans->accounting = NULL; -#endif - ans->state += L_BASE; - if (ans->state >= L_BASE * IO_BASE) return 1; -#if ANS_MAX_SYMBOLS - assert(ans->window_size > 1); - ans->symbols_left = ans->window_size; -#endif - return 0; -} - -#if ANS_REVERSE -static INLINE int ans_read_reinit(struct AnsDecoder *const ans) { - return ans_read_init(ans, ans->buf + ans->buf_offset, -ans->buf_offset); -} -#endif - -static INLINE int ans_read_end(const struct AnsDecoder *const ans) { - return ans->buf_offset == 0 && ans->state < L_BASE; -} - -static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) { - return ans->state < L_BASE / RANS_PRECISION; -} -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // AOM_DSP_ANSREADER_H_ diff --git a/third_party/aom/aom_dsp/answriter.h b/third_party/aom/aom_dsp/answriter.h deleted file mode 100644 index 353acf1a9..000000000 --- a/third_party/aom/aom_dsp/answriter.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_ANSWRITER_H_ -#define AOM_DSP_ANSWRITER_H_ -// An implementation of Asymmetric Numeral Systems -// http://arxiv.org/abs/1311.2540v2 -// Implements encoding of: -// * rABS (range Asymmetric Binary Systems), a boolean coder -// * rANS (range Asymmetric Numeral Systems), a multi-symbol coder - -#include -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "aom_dsp/ans.h" -#include "aom_dsp/prob.h" -#include "aom_ports/mem_ops.h" -#include "av1/common/odintrin.h" - -#if RANS_PRECISION <= OD_DIVU_DMAX -#define ANS_DIVREM(quotient, remainder, dividend, divisor) \ - do { \ - quotient = OD_DIVU_SMALL((dividend), (divisor)); \ - remainder = (dividend) - (quotient) * (divisor); \ - } while (0) -#else -#define ANS_DIVREM(quotient, remainder, dividend, divisor) \ - do { \ - quotient = (dividend) / (divisor); \ - remainder = (dividend) % (divisor); \ - } while (0) -#endif - -#define ANS_DIV8(dividend, divisor) OD_DIVU_SMALL((dividend), (divisor)) - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -struct AnsCoder { - uint8_t *buf; - int buf_offset; - uint32_t state; -}; - -static INLINE void ans_write_init(struct AnsCoder *const ans, - uint8_t *const buf) { - ans->buf = buf; - ans->buf_offset = 0; - ans->state = L_BASE; -} - -static INLINE int ans_write_end(struct AnsCoder *const ans) { - uint32_t state; - int ans_size; - assert(ans->state >= L_BASE); - assert(ans->state < L_BASE * IO_BASE); - state = ans->state - L_BASE; - if (state < (1u << 15)) { - mem_put_le16(ans->buf + ans->buf_offset, (0x00u << 15) + state); - ans_size = ans->buf_offset + 2; -#if ANS_REVERSE -#if L_BASE * IO_BASE > (1 << 23) - } else if (state < (1u << 22)) { - mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state); - ans_size = ans->buf_offset + 3; - } else if (state < (1u << 30)) { - mem_put_le32(ans->buf + ans->buf_offset, (0x03u << 30) + state); - ans_size = ans->buf_offset + 4; -#else - } else if (state < (1u << 23)) { - mem_put_le24(ans->buf + ans->buf_offset, (0x01u << 23) + state); - ans_size = ans->buf_offset + 3; -#endif -#else - } else if (state < (1u << 22)) { - mem_put_le24(ans->buf + ans->buf_offset, (0x02u << 22) + state); - ans_size = ans->buf_offset + 3; - } else if (state < (1u << 29)) { - mem_put_le32(ans->buf + ans->buf_offset, (0x07u << 29) + state); - ans_size = ans->buf_offset + 4; -#endif - } else { - assert(0 && "State is too large to be serialized"); - return ans->buf_offset; - } -#if ANS_REVERSE - { - int i; - uint8_t tmp; - for (i = 0; i < (ans_size >> 1); i++) { - tmp = ans->buf[i]; - ans->buf[i] = ans->buf[ans_size - 1 - i]; - ans->buf[ans_size - 1 - i] = tmp; - } - ans->buf += ans_size; - ans->buf_offset = 0; - ans->state = L_BASE; - } -#endif - return ans_size; -} - -// Write one boolean using rABS where p0 is the probability of the value being -// zero. -static INLINE void rabs_write(struct AnsCoder *ans, int value, AnsP8 p0) { - const AnsP8 p = ANS_P8_PRECISION - p0; - const unsigned l_s = value ? p : p0; - unsigned state = ans->state; - while (state >= L_BASE / ANS_P8_PRECISION * IO_BASE * l_s) { - ans->buf[ans->buf_offset++] = state % IO_BASE; - state /= IO_BASE; - } - const unsigned quotient = ANS_DIV8(state, l_s); - const unsigned remainder = state - quotient * l_s; - ans->state = quotient * ANS_P8_PRECISION + remainder + (value ? p0 : 0); -} - -// Encode one symbol using rANS. -// cum_prob: The cumulative probability before this symbol (the offset of -// the symbol in the symbol cycle) -// prob: The probability of this symbol (l_s from the paper) -// RANS_PRECISION takes the place of m from the paper. -static INLINE void rans_write(struct AnsCoder *ans, aom_cdf_prob cum_prob, - aom_cdf_prob prob) { - unsigned quotient, remainder; - while (ans->state >= L_BASE / RANS_PRECISION * IO_BASE * prob) { - ans->buf[ans->buf_offset++] = ans->state % IO_BASE; - ans->state /= IO_BASE; - } - ANS_DIVREM(quotient, remainder, ans->state, prob); - ans->state = quotient * RANS_PRECISION + remainder + cum_prob; -} - -#undef ANS_DIV8 -#undef ANS_DIVREM -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus -#endif // AOM_DSP_ANSWRITER_H_ diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c index c903ea52d..bba37e227 100644 --- a/third_party/aom/aom_dsp/aom_convolve.c +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -12,73 +12,40 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" +static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; +} + +static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; +} + static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h) { - int x, y; src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { + for (int y = 0; y < h; ++y) { int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_qn, - int x_step_qn, int w, int h) { - int x, y; - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_qn = x0_qn; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; // q8 - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *const x_filter = x_filters[x_filter_idx]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + const int sum = horz_scalar_product(src_x, x_filter); dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - x_qn += x_step_qn; - } - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - int x, y; - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO( - dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); x_q4 += x_step_q4; } src += src_stride; @@ -86,97 +53,19 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_qn, - int x_step_qn, int w, int h) { - int x, y; - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_qn = x0_qn; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; - const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS; - assert(x_filter_idx < SUBPEL_SHIFTS); - const int16_t *const x_filter = x_filters[x_filter_idx]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO( - dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); - x_qn += x_step_qn; - } - src += src_stride; - dst += dst_stride; - } -} - static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h) { - int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { + for (int y = 0; y < h; ++y) { const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_qn, - int y_step_qn, int w, int h) { - int x, y; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_qn = y0_qn; - for (y = 0; y < h; ++y) { - const unsigned char *src_y = - &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = - y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; + const int sum = vert_scalar_product(src_y, src_stride, y_filter); dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - y_qn += y_step_qn; - } - ++src; - ++dst; - } -} - -static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int x, y; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO( - dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), - 1); y_q4 += y_step_q4; } ++src; @@ -184,103 +73,6 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_qn, - int y_step_qn, int w, int h) { - int x, y; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_qn = y0_qn; - for (y = 0; y < h; ++y) { - const unsigned char *src_y = - &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = - y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO( - dst[y * dst_stride] + - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), - 1); - y_qn += y_step_qn; - } - ++src; - ++dst; - } -} - -static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, - MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, - intermediate_height); - convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); -} - -static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_qn, - int x_step_qn, const InterpKernel *const y_filters, - int y0_qn, int y_step_qn, int w, int h) { - // TODO(afergs): Update comment here - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; - int intermediate_height = - (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - - assert(y_step_qn <= SCALE_SUBPEL_BITS * 2); - assert(x_step_qn <= SCALE_SUBPEL_BITS * 2); - - convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w, - intermediate_height); - convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, - dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h); -} - static const InterpKernel *get_filter_base(const int16_t *filter) { // NOTE: This assumes that the filter table is 256-byte aligned. // TODO(agrange) Modify to make independent of table alignment. @@ -306,52 +98,6 @@ void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, w, h); } -void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int subpel_x, - int x_step_qn, const int16_t *filter_y, - int subpel_y, int y_step_qn, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - - (void)subpel_y; - (void)filter_y; - (void)y_step_qn; - - convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x, - x_step_qn, w, h); -} - -void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); -} - -void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int subpel_x, - int x_step_qn, const int16_t *filter_y, - int subpel_y, int y_step_qn, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - - (void)subpel_y; - (void)filter_y; - (void)y_step_qn; - - convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, - subpel_x, x_step_qn, w, h); -} - void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -367,109 +113,6 @@ void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, w, h); } -void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int subpel_x, - int x_step_qn, const int16_t *filter_y, - int subpel_y, int y_step_qn, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - - (void)subpel_x; - (void)filter_x; - (void)x_step_qn; - - convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y, - y_step_qn, w, h); -} - -void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); -} - -void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int subpel_x, - int x_step_qn, const int16_t *filter_y, - int subpel_y, int y_step_qn, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - - (void)subpel_x; - (void)filter_x; - (void)x_step_qn; - - convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, - subpel_y, y_step_qn, w, h); -} - -void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); -} - -void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int subpel_x, int x_step_qn, - const int16_t *filter_y, int subpel_y, int y_step_qn, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - - convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x, - x_step_qn, filters_y, subpel_y, y_step_qn, w, h); -} - -void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Fixed size intermediate buffer places limits on parameters. */ - DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - - aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, - h); -} - -void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int subpel_x, - int x_step_qn, const int16_t *filter_y, - int subpel_y, int y_step_qn, int w, int h) { - /* Fixed size intermediate buffer places limits on parameters. */ - DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - - aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x, - x_step_qn, filter_y, subpel_y, y_step_qn, w, h); - aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w, - h); -} - void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, const int16_t *filter_y, @@ -488,330 +131,34 @@ void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } } -void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { - int x, y; - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - - src += src_stride; - dst += dst_stride; - } -} - -void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); -} - -// TODO(afergs): Make sure this works too -#if CONFIG_LOOP_RESTORATION -static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - int x, y, k; - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + - src_x[SUBPEL_TAPS / 2 - 1]); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int x, y, k; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + - src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { - uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, - intermediate_height); - convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, - dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h); -} - -void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); -} - -void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); -} - -void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - -static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - const int bd = 8; - int x, y, k; - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + - (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = - (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS), - 0, EXTRAPREC_CLAMP_LIMIT(bd) - 1); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - const int bd = 8; - int x, y, k; - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int sum = - ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)); - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = - clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS)); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { - uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, - x_step_q4, w, intermediate_height); - convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, - y_step_q4, w, h); -} - -void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride, - uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; - (void)y_step_q4; - - convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); -} - -void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; - (void)x_step_q4; - - convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); +static INLINE int highbd_vert_scalar_product(const uint16_t *a, + ptrdiff_t a_stride, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k]; + return sum; } -void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); +static INLINE int highbd_horz_scalar_product(const uint16_t *a, + const int16_t *b) { + int sum = 0; + for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k]; + return sum; } -#endif // CONFIG_LOOP_RESTORATION -// TODO(afergs): Make sure this works too -#if CONFIG_HIGHBITDEPTH static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { - int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { + for (int y = 0; y < h; ++y) { int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + const int sum = highbd_horz_scalar_product(src_x, x_filter); dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); x_q4 += x_step_q4; } @@ -820,47 +167,19 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h, int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = ROUND_POWER_OF_TWO( - dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { - int x, y; uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (x = 0; x < w; ++x) { + for (int x = 0; x < w; ++x) { int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { + for (int y = 0; y < h; ++y) { const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; + const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter); dst[y * dst_stride] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); y_q4 += y_step_q4; @@ -870,67 +189,6 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, } } -static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h, int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int k, sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = ROUND_POWER_OF_TWO( - dst[y * dst_stride] + - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd), - 1); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, int bd) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4, - x_step_q4, w, intermediate_height, bd); - highbd_convolve_vert( - CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); -} - void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -945,20 +203,6 @@ void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, w, h, bd); } -void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; - (void)y_step_q4; - - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h, bd); -} - void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -973,51 +217,6 @@ void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h, bd); } -void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; - (void)x_step_q4; - - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - // Fixed size intermediate buffer places limits on parameters. - DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]); - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - - aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); - aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst, - dst_stride, NULL, 0, NULL, 0, w, h, bd); -} - void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, @@ -1038,295 +237,3 @@ void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, dst += dst_stride; } } - -void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { - int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - (void)bd; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - } - src += src_stride; - dst += dst_stride; - } -} - -#if CONFIG_LOOP_RESTORATION -static void highbd_convolve_add_src_horiz(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, - ptrdiff_t dst_stride, - const InterpKernel *x_filters, - int x0_q4, int x_step_q4, int w, - int h, int bd) { - int x, y, k; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = clip_pixel_highbd( - ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1], - bd); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void highbd_convolve_add_src_vert(const uint8_t *src8, - ptrdiff_t src_stride, uint8_t *dst8, - ptrdiff_t dst_stride, - const InterpKernel *y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { - int x, y, k; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int sum = 0; - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = - clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) + - src_y[(SUBPEL_TAPS / 2 - 1) * src_stride], - bd); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, - int bd) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w, - intermediate_height, bd); - highbd_convolve_add_src_vert( - CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_add_src_horiz_c( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; - (void)y_step_q4; - - highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src, - ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; - (void)x_step_q4; - - highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd); -} - -static void highbd_convolve_add_src_horiz_hip( - const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst, - ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h, int bd) { - const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd); - int x, y, k; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - src -= SUBPEL_TAPS / 2 - 1; - for (y = 0; y < h; ++y) { - int x_q4 = x0_q4; - for (x = 0; x < w; ++x) { - const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) + - (1 << (bd + FILTER_BITS - 1)); - for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; - dst[x] = - (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS), - 0, extraprec_clamp_limit - 1); - x_q4 += x_step_q4; - } - src += src_stride; - dst += dst_stride; - } -} - -static void highbd_convolve_add_src_vert_hip( - const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8, - ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h, int bd) { - int x, y, k; - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (x = 0; x < w; ++x) { - int y_q4 = y0_q4; - for (y = 0; y < h; ++y) { - const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - int sum = - ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) - - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)); - for (k = 0; k < SUBPEL_TAPS; ++k) - sum += src_y[k * src_stride] * y_filter[k]; - dst[y * dst_stride] = clip_pixel_highbd( - ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd); - y_q4 += y_step_q4; - } - ++src; - ++dst; - } -} - -static void highbd_convolve_add_src_hip( - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h, int bd) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE]; - int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - highbd_convolve_add_src_horiz_hip( - src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE, - x_filters, x0_q4, x_step_q4, w, intermediate_height, bd); - highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, - y0_q4, y_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_add_src_horiz_hip_c( - const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - (void)filter_y; - (void)y_step_q4; - - highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_add_src_vert_hip_c( - const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - (void)filter_x; - (void)x_step_q4; - - highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, - y0_q4, y_step_q4, w, h, bd); -} - -void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src, - ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, - x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w, - h, bd); -} - -#endif // CONFIG_LOOP_RESTORATION -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h index c7943dced..6f5b888e4 100644 --- a/third_party/aom/aom_dsp/aom_convolve.h +++ b/third_party/aom/aom_dsp/aom_convolve.h @@ -11,7 +11,8 @@ #ifndef AOM_DSP_AOM_CONVOLVE_H_ #define AOM_DSP_AOM_CONVOLVE_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #ifdef __cplusplus @@ -30,16 +31,11 @@ extern "C" { // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. -#if CONFIG_AV1 && CONFIG_EXT_PARTITION +// TODO(wtc): Update the above comment to explain the value 263 used in aom. #define MAX_EXT_SIZE 263 -#else -#define MAX_EXT_SIZE 135 -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION -#if CONFIG_AV1 && CONFIG_LOOP_RESTORATION #define EXTRAPREC_BITS 2 #define EXTRAPREC_CLAMP_LIMIT(bd) (1 << ((bd) + 1 + EXTRAPREC_BITS)) -#endif typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, @@ -47,13 +43,11 @@ typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -#if CONFIG_HIGHBITDEPTH typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd); -#endif #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 11b55caa7..768875f7d 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -1,475 +1,242 @@ -## -## Copyright (c) 2017, Alliance for Open Media. All rights reserved -## -## This source code is subject to the terms of the BSD 2 Clause License and -## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -## was not distributed with this source code in the LICENSE file, you can -## obtain it at www.aomedia.org/license/software. If the Alliance for Open -## Media Patent License 1.0 was not distributed with this source code in the -## PATENTS file, you can obtain it at www.aomedia.org/license/patent. -## -if (NOT AOM_AOM_DSP_AOM_DSP_CMAKE_) +# +# Copyright (c) 2017, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and the +# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was +# not distributed with this source code in the LICENSE file, you can obtain it +# at www.aomedia.org/license/software. If the Alliance for Open Media Patent +# License 1.0 was not distributed with this source code in the PATENTS file, you +# can obtain it at www.aomedia.org/license/patent. +# +if(AOM_AOM_DSP_AOM_DSP_CMAKE_) + return() +endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_ set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) -set(AOM_DSP_COMMON_SOURCES - "${AOM_ROOT}/aom_dsp/aom_convolve.c" - "${AOM_ROOT}/aom_dsp/aom_convolve.h" - "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" - "${AOM_ROOT}/aom_dsp/aom_filter.h" - "${AOM_ROOT}/aom_dsp/aom_simd.h" - "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" - "${AOM_ROOT}/aom_dsp/blend.h" - "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" - "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" - "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" - "${AOM_ROOT}/aom_dsp/intrapred.c" - "${AOM_ROOT}/aom_dsp/intrapred_common.h" - "${AOM_ROOT}/aom_dsp/loopfilter.c" - "${AOM_ROOT}/aom_dsp/prob.c" - "${AOM_ROOT}/aom_dsp/prob.h" - "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" - "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" - "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" - "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" - "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" - "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" - "${AOM_ROOT}/aom_dsp/subtract.c" - "${AOM_ROOT}/aom_dsp/txfm_common.h" - "${AOM_ROOT}/aom_dsp/x86/txfm_common_intrin.h") - -set(AOM_DSP_COMMON_ASM_SSE2 - "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm") - -set(AOM_DSP_COMMON_INTRIN_SSE2 - "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" - "${AOM_ROOT}/aom_dsp/x86/convolve.h" - "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c") - -set(AOM_DSP_COMMON_ASM_SSSE3 - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm" - "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.asm") - -set(AOM_DSP_COMMON_INTRIN_SSSE3 - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/inv_txfm_ssse3.c") - -set(AOM_DSP_COMMON_INTRIN_SSE4_1 - "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") - -set(AOM_DSP_COMMON_INTRIN_AVX2 - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/inv_txfm_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/inv_txfm_common_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h") - -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_AVX2 - ${AOM_DSP_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c") -endif () - -if (NOT CONFIG_EXT_PARTITION) - set(AOM_DSP_COMMON_ASM_NEON - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon_asm.asm") -endif () - -set(AOM_DSP_COMMON_ASM_NEON - ${AOM_DSP_COMMON_ASM_NEON} - "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/intrapred_neon_asm.asm" - "${AOM_ROOT}/aom_dsp/arm/save_reg_neon.asm") - -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_ASM_NEON - ${AOM_DSP_COMMON_ASM_NEON} - "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.asm" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_mb_neon.asm") -endif () - -if (NOT CONFIG_EXT_PARTITION) - set(AOM_DSP_COMMON_INTRIN_NEON - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_neon.c") -endif () - -set(AOM_DSP_COMMON_INTRIN_NEON - ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/avg_neon.c" - "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" - "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct16x16_neon.c" - "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" - "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" - "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" - "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" - "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" - "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") - -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_NEON - ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c") -endif () - -if ("${AOM_TARGET_CPU}" STREQUAL "arm64") - if (NOT CONFIG_EXT_PARTITION) - set(AOM_DSP_COMMON_INTRIN_NEON - ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_avg_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_avg_neon.c" - "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c") - endif () - - set(AOM_DSP_COMMON_INTRIN_NEON - ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/idct16x16_1_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct16x16_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct32x32_1_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct32x32_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct4x4_1_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct4x4_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct8x8_1_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/idct8x8_add_neon.c" - "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c") - - if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_NEON - ${AOM_DSP_COMMON_INTRIN_NEON} - "${AOM_ROOT}/aom_dsp/arm/loopfilter_16_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_4_neon.c" - "${AOM_ROOT}/aom_dsp/arm/loopfilter_8_neon.c") - endif () -endif () - -set(AOM_DSP_COMMON_INTRIN_DSPR2 - "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve2_avg_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_avg_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") - -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_DSPR2 - ${AOM_DSP_COMMON_INTRIN_DSPR2} - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_filters_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_macros_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_masks_dspr2.h" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_mb_vert_dspr2.c") -endif () - -set(AOM_DSP_COMMON_INTRIN_MSA - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_avg_vert_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve_avg_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c" - "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h" - "${AOM_ROOT}/aom_dsp/mips/fwd_dct32x32_msa.c" - "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.c" - "${AOM_ROOT}/aom_dsp/mips/fwd_txfm_msa.h" - "${AOM_ROOT}/aom_dsp/mips/idct16x16_msa.c" - "${AOM_ROOT}/aom_dsp/mips/idct32x32_msa.c" - "${AOM_ROOT}/aom_dsp/mips/idct4x4_msa.c" - "${AOM_ROOT}/aom_dsp/mips/idct8x8_msa.c" - "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" - "${AOM_ROOT}/aom_dsp/mips/inv_txfm_msa.h" - "${AOM_ROOT}/aom_dsp/mips/macros_msa.h" - "${AOM_ROOT}/aom_dsp/mips/txfm_macros_msa.h") - -if (NOT CONFIG_PARALLEL_DEBLOCKING) - set(AOM_DSP_COMMON_INTRIN_MSA - ${AOM_DSP_COMMON_INTRIN_MSA} - "${AOM_ROOT}/aom_dsp/mips/loopfilter_16_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_4_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_8_msa.c" - "${AOM_ROOT}/aom_dsp/mips/loopfilter_msa.h") -endif () - -if (CONFIG_HIGHBITDEPTH) - set(AOM_DSP_COMMON_ASM_SSE2 - ${AOM_DSP_COMMON_ASM_SSE2} - "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm") - - set(AOM_DSP_COMMON_INTRIN_SSE2 - ${AOM_DSP_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c") - - set(AOM_DSP_COMMON_INTRIN_SSSE3 - ${AOM_DSP_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_ssse3.c") - - set(AOM_DSP_COMMON_INTRIN_AVX2 - ${AOM_DSP_COMMON_INTRIN_AVX2} - "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c") -else () - set(AOM_DSP_COMMON_INTRIN_DSPR2 - ${AOM_DSP_COMMON_INTRIN_DSPR2} - "${AOM_ROOT}/aom_dsp/mips/itrans16_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/itrans32_cols_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/itrans32_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/itrans4_dspr2.c" - "${AOM_ROOT}/aom_dsp/mips/itrans8_dspr2.c") -endif () - -if (CONFIG_ANS) - set(AOM_DSP_COMMON_SOURCES - ${AOM_DSP_COMMON_SOURCES} - "${AOM_ROOT}/aom_dsp/ans.h") -else () - set(AOM_DSP_COMMON_SOURCES - ${AOM_DSP_COMMON_SOURCES} - "${AOM_ROOT}/aom_dsp/entcode.c" - "${AOM_ROOT}/aom_dsp/entcode.h") -endif () - -if (CONFIG_AV1) - set(AOM_DSP_COMMON_SOURCES - ${AOM_DSP_COMMON_SOURCES} - "${AOM_ROOT}/aom_dsp/inv_txfm.c" - "${AOM_ROOT}/aom_dsp/inv_txfm.h") - - set(AOM_DSP_COMMON_ASM_SSE2 - ${AOM_DSP_COMMON_ASM_SSE2} - "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") - - set(AOM_DSP_COMMON_INTRIN_SSE2 - ${AOM_DSP_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/inv_txfm_sse2.h") -endif () - -if (CONFIG_AV1_DECODER) - set(AOM_DSP_DECODER_SOURCES - "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" - "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" - "${AOM_ROOT}/aom_dsp/bitreader.h" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.h") - - if (CONFIG_ANS) - set(AOM_DSP_DECODER_SOURCES - ${AOM_DSP_DECODER_SOURCES} - "${AOM_ROOT}/aom_dsp/ansreader.h") - else () - set(AOM_DSP_DECODER_SOURCES - ${AOM_DSP_DECODER_SOURCES} - "${AOM_ROOT}/aom_dsp/daalaboolreader.c" - "${AOM_ROOT}/aom_dsp/daalaboolreader.h" - "${AOM_ROOT}/aom_dsp/entdec.c" - "${AOM_ROOT}/aom_dsp/entdec.h") - endif () -endif () - -if (CONFIG_AV1_ENCODER) - set(AOM_DSP_ENCODER_SOURCES - "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" - "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" - "${AOM_ROOT}/aom_dsp/bitwriter.h" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" - "${AOM_ROOT}/aom_dsp/psnr.c" - "${AOM_ROOT}/aom_dsp/psnr.h" - "${AOM_ROOT}/aom_dsp/sad.c" - "${AOM_ROOT}/aom_dsp/variance.c" - "${AOM_ROOT}/aom_dsp/variance.h") - - set(AOM_DSP_ENCODER_ASM_SSE2 - ${AOM_DSP_ENCODER_ASM_SSE2} - "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm") - - set(AOM_DSP_ENCODER_INTRIN_SSE2 - "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c") - - set(AOM_DSP_ENCODER_ASM_SSSE3 - "${AOM_ROOT}/aom_dsp/x86/sad_ssse3.asm") - - set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64 - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" - "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm") - - set(AOM_DSP_ENCODER_INTRIN_SSE3 "${AOM_ROOT}/aom_dsp/x86/sad_sse3.asm") - set(AOM_DSP_ENCODER_ASM_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/sad_sse4.asm") - - set(AOM_DSP_ENCODER_INTRIN_AVX2 - "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_avx2.h" - "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c") - - if (CONFIG_AV1_ENCODER) - set(AOM_DSP_ENCODER_SOURCES - ${AOM_DSP_ENCODER_SOURCES} - "${AOM_ROOT}/aom_dsp/avg.c" - "${AOM_ROOT}/aom_dsp/fwd_txfm.c" - "${AOM_ROOT}/aom_dsp/fwd_txfm.h" - "${AOM_ROOT}/aom_dsp/quantize.c" - "${AOM_ROOT}/aom_dsp/quantize.h" - "${AOM_ROOT}/aom_dsp/sum_squares.c") - - set(AOM_DSP_ENCODER_INTRIN_SSE2 - ${AOM_DSP_ENCODER_INTRIN_SSE2} - "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/fwd_dct32_8cols_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/fwd_dct32x32_impl_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c") - - set(AOM_DSP_ENCODER_ASM_SSSE3_X86_64 - ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64} - "${AOM_ROOT}/aom_dsp/x86/avg_ssse3_x86_64.asm" - "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") - - set(AOM_DSP_ENCODER_AVX_ASM_X86_64 - ${AOM_DSP_ENCODER_AVX_ASM_X86_64} - "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") - - set(AOM_DSP_ENCODER_INTRIN_MSA - "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" - "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" - "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" - "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") - - set(AOM_DSP_ENCODER_INTRIN_SSSE3 - ${AOM_DSP_ENCODER_INTRIN_SSSE3} - "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" - "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c") - - if (CONFIG_HIGHBITDEPTH) - set(AOM_DSP_ENCODER_INTRIN_SSE2 - ${AOM_DSP_ENCODER_INTRIN_SSE2} - "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c") - endif () - endif () - - if (CONFIG_HIGHBITDEPTH) - set(AOM_DSP_ENCODER_ASM_SSE2 - ${AOM_DSP_ENCODER_ASM_SSE2} - "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm") - - set(AOM_DSP_ENCODER_INTRIN_SSE2 - ${AOM_DSP_ENCODER_INTRIN_SSE2} - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c") - - set(AOM_DSP_ENCODER_INTRIN_SSE4_1 - ${AOM_DSP_ENCODER_INTRIN_SSE4_1} - "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c") - - set(AOM_DSP_ENCODER_INTRIN_AVX2 - ${AOM_DSP_ENCODER_INTRIN_AVX2} - "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c") - endif () - - if (CONFIG_ANS) - set(AOM_DSP_ENCODER_SOURCES - ${AOM_DSP_ENCODER_SOURCES} - "${AOM_ROOT}/aom_dsp/answriter.h" - "${AOM_ROOT}/aom_dsp/buf_ans.c" - "${AOM_ROOT}/aom_dsp/buf_ans.h") - else () - set(AOM_DSP_ENCODER_SOURCES - ${AOM_DSP_ENCODER_SOURCES} - "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" - "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" - "${AOM_ROOT}/aom_dsp/entenc.c" - "${AOM_ROOT}/aom_dsp/entenc.h") - endif () - - if (CONFIG_INTERNAL_STATS) - set(AOM_DSP_ENCODER_SOURCES - ${AOM_DSP_ENCODER_SOURCES} - "${AOM_ROOT}/aom_dsp/fastssim.c" - "${AOM_ROOT}/aom_dsp/psnrhvs.c" - "${AOM_ROOT}/aom_dsp/ssim.c" - "${AOM_ROOT}/aom_dsp/ssim.h") - endif () -endif () - -if (CONFIG_LOOP_RESTORATION) - set(AOM_DSP_COMMON_INTRIN_SSE2 - ${AOM_DSP_COMMON_INTRIN_SSE2} - "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c") - - if (CONFIG_HIGHBITDEPTH) - set(AOM_DSP_COMMON_INTRIN_SSSE3 - ${AOM_DSP_COMMON_INTRIN_SSSE3} - "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c") - endif () -endif () - -if (CONFIG_MOTION_VAR) - set(AOM_DSP_ENCODER_INTRIN_SSE4_1 - ${AOM_DSP_ENCODER_INTRIN_SSE4_1} - "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" - "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") -endif () +list(APPEND AOM_DSP_COMMON_SOURCES + "${AOM_ROOT}/aom_dsp/aom_convolve.c" + "${AOM_ROOT}/aom_dsp/aom_convolve.h" + "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" + "${AOM_ROOT}/aom_dsp/aom_filter.h" + "${AOM_ROOT}/aom_dsp/aom_simd.h" + "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/blend.h" + "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" + "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c" + "${AOM_ROOT}/aom_dsp/entcode.c" + "${AOM_ROOT}/aom_dsp/entcode.h" + "${AOM_ROOT}/aom_dsp/fft.c" + "${AOM_ROOT}/aom_dsp/fft_common.h" + "${AOM_ROOT}/aom_dsp/intrapred.c" + "${AOM_ROOT}/aom_dsp/intrapred_common.h" + "${AOM_ROOT}/aom_dsp/loopfilter.c" + "${AOM_ROOT}/aom_dsp/prob.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h" + "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h" + "${AOM_ROOT}/aom_dsp/subtract.c" + "${AOM_ROOT}/aom_dsp/txfm_common.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h") + +list(APPEND AOM_DSP_COMMON_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" + "${AOM_ROOT}/aom_dsp/x86/convolve.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h") + +list(APPEND AOM_DSP_COMMON_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" + "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c" + "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c" + "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c") + +list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2 + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c" + "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h") + +list(APPEND AOM_DSP_COMMON_INTRIN_MSA + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c" + "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h" + "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c" + "${AOM_ROOT}/aom_dsp/mips/macros_msa.h") + +if(CONFIG_AV1_DECODER) + list(APPEND AOM_DSP_DECODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" + "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" + "${AOM_ROOT}/aom_dsp/bitreader.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" + "${AOM_ROOT}/aom_dsp/daalaboolreader.c" + "${AOM_ROOT}/aom_dsp/daalaboolreader.h" + "${AOM_ROOT}/aom_dsp/entdec.c" + "${AOM_ROOT}/aom_dsp/entdec.h" + "${AOM_ROOT}/aom_dsp/grain_synthesis.c" + "${AOM_ROOT}/aom_dsp/grain_synthesis.h") +endif() + +if(CONFIG_AV1_ENCODER) + list(APPEND AOM_DSP_ENCODER_SOURCES + "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" + "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" + "${AOM_ROOT}/aom_dsp/bitwriter.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" + "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" + "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" + "${AOM_ROOT}/aom_dsp/entenc.c" + "${AOM_ROOT}/aom_dsp/entenc.h" + "${AOM_ROOT}/aom_dsp/fwd_txfm.c" + "${AOM_ROOT}/aom_dsp/grain_table.c" + "${AOM_ROOT}/aom_dsp/grain_table.h" + "${AOM_ROOT}/aom_dsp/noise_model.c" + "${AOM_ROOT}/aom_dsp/noise_model.h" + "${AOM_ROOT}/aom_dsp/noise_util.c" + "${AOM_ROOT}/aom_dsp/noise_util.h" + "${AOM_ROOT}/aom_dsp/psnr.c" + "${AOM_ROOT}/aom_dsp/psnr.h" + "${AOM_ROOT}/aom_dsp/quantize.c" + "${AOM_ROOT}/aom_dsp/quantize.h" + "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/sad_av1.c" + "${AOM_ROOT}/aom_dsp/sum_squares.c" + "${AOM_ROOT}/aom_dsp/variance.c" + "${AOM_ROOT}/aom_dsp/variance.h") + + list(APPEND AOM_DSP_ENCODER_ASM_SSE2 + "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" + "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") + + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" + "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c") + + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64 + "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" + "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_NEON + "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c" + "${AOM_ROOT}/aom_dsp/arm/sad_neon.c" + "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c" + "${AOM_ROOT}/aom_dsp/arm/variance_neon.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c" + "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c" + "${AOM_ROOT}/aom_dsp/mips/variance_msa.c" + "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c") + + if(CONFIG_INTERNAL_STATS) + list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c" + "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c" + "${AOM_ROOT}/aom_dsp/ssim.h") + endif() +endif() # Creates aom_dsp build targets. Must not be called until after libaom target # has been created. -function (setup_aom_dsp_targets) +function(setup_aom_dsp_targets) add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES}) list(APPEND AOM_LIB_TARGETS aom_dsp_common) create_dummy_source_file("aom_av1" "c" "dummy_source_file") @@ -481,113 +248,97 @@ function (setup_aom_dsp_targets) # dummy source file to the aom_dsp target. add_dummy_source_file_to_target("aom_dsp" "c") - if (CONFIG_AV1_DECODER) + if(CONFIG_AV1_DECODER) add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES}) - set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_decoder) + list(APPEND AOM_LIB_TARGETS aom_dsp_decoder) target_sources(aom PRIVATE $) - endif () + endif() - if (CONFIG_AV1_ENCODER) + if(CONFIG_AV1_ENCODER) add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES}) - set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_dsp_encoder) + list(APPEND AOM_LIB_TARGETS aom_dsp_encoder) target_sources(aom PRIVATE $) - endif () + endif() - if (HAVE_SSE2) + if(HAVE_SSE2) add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common" - "AOM_DSP_COMMON_INTRIN_SSE2" "aom") + "AOM_DSP_COMMON_INTRIN_SSE2" "aom") - if (CONFIG_AV1_ENCODER) - add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" - "aom") + if(CONFIG_AV1_ENCODER) + add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom") add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_SSE2" "aom") endif() - endif () - - if (HAVE_SSE3 AND CONFIG_AV1_ENCODER) - add_asm_library("aom_dsp_encoder_sse3" "AOM_DSP_ENCODER_INTRIN_SSE3" "aom") - endif () + endif() - if (HAVE_SSSE3) + if(HAVE_SSSE3) add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom") add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSSE3" "aom") - if (CONFIG_AV1_ENCODER) - if ("${AOM_TARGET_CPU}" STREQUAL "x86_64") + if(CONFIG_AV1_ENCODER) + if("${AOM_TARGET_CPU}" STREQUAL "x86_64") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 - ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) - endif () + ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64}) + endif() add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom") - if (AOM_DSP_ENCODER_INTRIN_SSSE3) - add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom") - endif () - endif () - endif () - - if (HAVE_SSE4_1) + add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom") + endif() + endif() + + if(HAVE_SSE4_1) add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom") - if (CONFIG_AV1_ENCODER) - if (AOM_DSP_ENCODER_INTRIN_SSE4_1) - add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" - "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom") - endif () - add_asm_library("aom_dsp_encoder_sse4_1" "AOM_DSP_ENCODER_ASM_SSE4_1" - "aom") - endif () - endif () + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom") + endif() + endif() - if (HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") - if (CONFIG_AV1_ENCODER) + if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64") + if(CONFIG_AV1_ENCODER) add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64" "aom") - endif () - endif () + endif() + endif() - if (HAVE_AVX2) + if(HAVE_AVX2) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_AVX2" "aom") - if (CONFIG_AV1_ENCODER) + if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_AVX2" "aom") - endif () - endif () - - if (HAVE_NEON_ASM) - if (AOM_ADS2GAS_REQUIRED) - add_gas_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom") - else () - add_asm_library("aom_dsp_common_neon" "AOM_DSP_COMMON_ASM_NEON" "aom") - endif () - endif () - - if (HAVE_NEON) + endif() + endif() + + if(HAVE_NEON) add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON" "aom") - endif () + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" + "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_NEON" "aom") + endif() + endif() - if (HAVE_DSPR2) + if(HAVE_DSPR2) add_intrinsics_object_library("" "dspr2" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_DSPR2" "aom") - endif () + endif() - if (HAVE_MSA) + if(HAVE_MSA) add_intrinsics_object_library("" "msa" "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_MSA" "aom") - if (CONFIG_AV1_ENCODER) + if(CONFIG_AV1_ENCODER) add_intrinsics_object_library("" "msa" "aom_dsp_encoder" "AOM_DSP_ENCODER_INTRIN_MSA" "aom") - endif () - endif () + endif() + endif() # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) -endfunction () - -endif () # AOM_AOM_DSP_AOM_DSP_CMAKE_ +endfunction() diff --git a/third_party/aom/aom_dsp/aom_dsp.mk b/third_party/aom/aom_dsp/aom_dsp.mk deleted file mode 100644 index 950db0216..000000000 --- a/third_party/aom/aom_dsp/aom_dsp.mk +++ /dev/null @@ -1,439 +0,0 @@ -## -## Copyright (c) 2016, Alliance for Open Media. All rights reserved -## -## This source code is subject to the terms of the BSD 2 Clause License and -## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -## was not distributed with this source code in the LICENSE file, you can -## obtain it at www.aomedia.org/license/software. If the Alliance for Open -## Media Patent License 1.0 was not distributed with this source code in the -## PATENTS file, you can obtain it at www.aomedia.org/license/patent. -## - - -DSP_SRCS-yes += aom_dsp.mk -DSP_SRCS-yes += aom_dsp_common.h - -DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h - -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/synonyms.h - -# bit reader -DSP_SRCS-yes += prob.h -DSP_SRCS-yes += prob.c -DSP_SRCS-$(CONFIG_ANS) += ans.h - -ifeq ($(CONFIG_AV1_ENCODER),yes) -ifeq ($(CONFIG_ANS),yes) -DSP_SRCS-yes += answriter.h -DSP_SRCS-yes += buf_ans.h -DSP_SRCS-yes += buf_ans.c -else -DSP_SRCS-yes += entenc.c -DSP_SRCS-yes += entenc.h -DSP_SRCS-yes += daalaboolwriter.c -DSP_SRCS-yes += daalaboolwriter.h -endif -DSP_SRCS-yes += bitwriter.h -DSP_SRCS-yes += bitwriter_buffer.c -DSP_SRCS-yes += bitwriter_buffer.h -DSP_SRCS-yes += binary_codes_writer.c -DSP_SRCS-yes += binary_codes_writer.h -DSP_SRCS-yes += psnr.c -DSP_SRCS-yes += psnr.h -DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c -DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h -DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c -DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c -endif - -ifeq ($(CONFIG_AV1_DECODER),yes) -ifeq ($(CONFIG_ANS),yes) -DSP_SRCS-yes += ansreader.h -else -DSP_SRCS-yes += entdec.c -DSP_SRCS-yes += entdec.h -DSP_SRCS-yes += daalaboolreader.c -DSP_SRCS-yes += daalaboolreader.h -endif -DSP_SRCS-yes += bitreader.h -DSP_SRCS-yes += bitreader_buffer.c -DSP_SRCS-yes += bitreader_buffer.h -DSP_SRCS-yes += binary_codes_reader.c -DSP_SRCS-yes += binary_codes_reader.h -endif - -# intra predictions -DSP_SRCS-yes += intrapred.c -DSP_SRCS-yes += intrapred_common.h - -ifneq ($(CONFIG_ANS),yes) -DSP_SRCS-yes += entcode.c -DSP_SRCS-yes += entcode.h -endif - -DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm - -DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.c -DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.c -DSP_SRCS-$(HAVE_AVX2) += x86/intrapred_avx2.c - -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.c -DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_ssse3.c -DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_avx2.c -endif # CONFIG_HIGHBITDEPTH - -DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) -DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c -DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c -DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred4_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred8_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c - -DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h -DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c - -# inter predictions -DSP_SRCS-yes += blend.h -DSP_SRCS-yes += blend_a64_mask.c -DSP_SRCS-yes += blend_a64_hmask.c -DSP_SRCS-yes += blend_a64_vmask.c -DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h -DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c -DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c -DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c - -# interpolation filters -DSP_SRCS-yes += aom_convolve.c -DSP_SRCS-yes += aom_convolve.h -DSP_SRCS-yes += aom_filter.h - -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/aom_asm_stubs.c -DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_8t_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/aom_subpixel_bilinear_sse2.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_bilinear_ssse3.asm -DSP_SRCS-$(HAVE_AVX2) += x86/aom_subpixel_8t_intrin_avx2.c -DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_intrin_ssse3.c -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_8t_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/aom_high_subpixel_bilinear_sse2.asm -DSP_SRCS-$(HAVE_AVX2) += x86/highbd_convolve_avx2.c -endif -DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_copy_sse2.asm - -ifneq ($(CONFIG_EXT_PARTITION),yes) -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/aom_convolve_copy_neon_asm$(ASM) -DSP_SRCS-yes += arm/aom_convolve8_avg_neon_asm$(ASM) -DSP_SRCS-yes += arm/aom_convolve8_neon_asm$(ASM) -DSP_SRCS-yes += arm/aom_convolve_avg_neon_asm$(ASM) -DSP_SRCS-yes += arm/aom_convolve_neon.c -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/aom_convolve_copy_neon.c -DSP_SRCS-yes += arm/aom_convolve8_avg_neon.c -DSP_SRCS-yes += arm/aom_convolve8_neon.c -DSP_SRCS-yes += arm/aom_convolve_avg_neon.c -DSP_SRCS-yes += arm/aom_convolve_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM -endif # CONFIG_EXT_PARTITION - -# common (msa) -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_horiz_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_avg_vert_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_horiz_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve8_vert_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_avg_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_copy_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/aom_convolve_msa.h - -# common (dspr2) -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_avg_horiz_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_horiz_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve2_vert_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_avg_horiz_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_horiz_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/convolve8_vert_dspr2.c - -# loop filters -DSP_SRCS-yes += loopfilter.c - -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c -DSP_SRCS-$(HAVE_SSE2) += x86/lpf_common_sse2.h - -ifneq ($(CONFIG_PARALLEL_DEBLOCKING),yes) -DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c - -DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/loopfilter_mb_neon$(ASM) -DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) -DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) -DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/loopfilter_16_neon.c -DSP_SRCS-yes += arm/loopfilter_8_neon.c -DSP_SRCS-yes += arm/loopfilter_4_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM - -DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h -DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_16_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_8_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_4_msa.c -DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.h -DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_filters_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_macros_dspr2.h -DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_masks_dspr2.h -DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c -endif # !CONFIG_PARALLEL_DEBLOCKING - -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c -DSP_SRCS-$(HAVE_AVX2) += x86/highbd_loopfilter_avx2.c -endif # CONFIG_HIGHBITDEPTH - -DSP_SRCS-yes += txfm_common.h -DSP_SRCS-yes += x86/txfm_common_intrin.h -DSP_SRCS-$(HAVE_AVX2) += x86/common_avx2.h -DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h -DSP_SRCS-$(HAVE_SSSE3) += x86/obmc_intrinsic_ssse3.h -DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h - -# forward transform -ifneq ($(findstring yes,$(CONFIG_AV1)$(CONFIG_PVQ)),) -DSP_SRCS-$(HAVE_AVX2) += x86/txfm_common_avx2.h -ifeq ($(CONFIG_AV1_ENCODER),yes) -DSP_SRCS-yes += fwd_txfm.c -DSP_SRCS-yes += fwd_txfm.h -DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h -DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c -DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c -DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h -DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm -endif -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.h -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h -DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c -DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h -DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c -endif # CONFIG_AV1_ENCODER -endif # CONFIG_AV1 - -# inverse transform -ifeq ($(CONFIG_AV1), yes) -DSP_SRCS-yes += inv_txfm.h -DSP_SRCS-yes += inv_txfm.c -DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h -DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c -DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c -DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_common_avx2.h -DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c - -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/save_reg_neon$(ASM) -DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct4x4_1_add_neon.c -DSP_SRCS-yes += arm/idct4x4_add_neon.c -DSP_SRCS-yes += arm/idct8x8_1_add_neon.c -DSP_SRCS-yes += arm/idct8x8_add_neon.c -DSP_SRCS-yes += arm/idct16x16_1_add_neon.c -DSP_SRCS-yes += arm/idct16x16_add_neon.c -DSP_SRCS-yes += arm/idct32x32_1_add_neon.c -DSP_SRCS-yes += arm/idct32x32_add_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c - -DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h -DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/idct16x16_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/idct32x32_msa.c - -ifneq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_DSPR2) += mips/inv_txfm_dspr2.h -DSP_SRCS-$(HAVE_DSPR2) += mips/itrans4_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c -DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c -endif # CONFIG_HIGHBITDEPTH - -ifeq ($(CONFIG_LOOP_RESTORATION),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/aom_convolve_hip_sse2.c -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/aom_highbd_convolve_hip_ssse3.c -endif -endif # CONFIG_LOOP_RESTORATION -endif # CONFIG_AV1 - -# quantization -ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),) -DSP_SRCS-yes += quantize.c -DSP_SRCS-yes += quantize.h - -DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c - -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c -DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c - -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm -DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm -endif - -# avg -DSP_SRCS-yes += avg.c -DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c -DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm -endif - -# high bit depth subtract -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subtract_sse2.c -endif - -endif # CONFIG_AV1_ENCODER - -ifeq ($(CONFIG_AV1_ENCODER),yes) -DSP_SRCS-yes += sum_squares.c - -DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c -endif # CONFIG_AV1_ENCODER - -ifeq ($(CONFIG_AV1_ENCODER),yes) -DSP_SRCS-yes += sad.c -DSP_SRCS-yes += subtract.c - -DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c - -DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c - -DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm -DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm -DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c - -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_AVX2) += x86/sad_highbd_avx2.c -endif - -ifeq ($(CONFIG_AV1_ENCODER),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c -DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c -ifeq ($(CONFIG_MOTION_VAR),yes) -DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c -DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c -endif #CONFIG_MOTION_VAR -ifeq ($(CONFIG_EXT_PARTITION),yes) -DSP_SRCS-$(HAVE_AVX2) += x86/sad_impl_avx2.c -endif -endif #CONFIG_AV1_ENCODER - -DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm -DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm - -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm -endif # CONFIG_HIGHBITDEPTH - -endif # CONFIG_AV1_ENCODER - -ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),) -DSP_SRCS-yes += variance.c -DSP_SRCS-yes += variance.h - -DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c - -DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c -DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c - -DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c -DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 -DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_sse2.c -DSP_SRCS-$(HAVE_SSE2) += x86/halfpix_variance_impl_sse2.asm -DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c - -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm -endif # ARCH_X86_64 - -DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 - -ifeq ($(CONFIG_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c -DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm -DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm -endif # CONFIG_HIGHBITDEPTH -endif # CONFIG_AV1_ENCODER - -DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) - -DSP_SRCS-yes += aom_dsp_rtcd.c -DSP_SRCS-yes += aom_dsp_rtcd_defs.pl - -DSP_SRCS-yes += aom_simd.h -DSP_SRCS-yes += aom_simd_inline.h -DSP_SRCS-yes += simd/v64_intrinsics.h -DSP_SRCS-yes += simd/v64_intrinsics_c.h -DSP_SRCS-yes += simd/v128_intrinsics.h -DSP_SRCS-yes += simd/v128_intrinsics_c.h -DSP_SRCS-yes += simd/v256_intrinsics.h -DSP_SRCS-yes += simd/v256_intrinsics_c.h -DSP_SRCS-yes += simd/v256_intrinsics_v128.h -DSP_SRCS-$(HAVE_SSE2) += simd/v64_intrinsics_x86.h -DSP_SRCS-$(HAVE_SSE2) += simd/v128_intrinsics_x86.h -DSP_SRCS-$(HAVE_SSE2) += simd/v256_intrinsics_x86.h -DSP_SRCS-$(HAVE_NEON) += simd/v64_intrinsics_arm.h -DSP_SRCS-$(HAVE_NEON) += simd/v128_intrinsics_arm.h -DSP_SRCS-$(HAVE_NEON) += simd/v256_intrinsics_arm.h - -$(eval $(call rtcd_h_template,aom_dsp_rtcd,aom_dsp/aom_dsp_rtcd_defs.pl)) diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h index 3d3bcba37..c5dc9a834 100644 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -12,7 +12,8 @@ #ifndef AOM_DSP_AOM_DSP_COMMON_H_ #define AOM_DSP_AOM_DSP_COMMON_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_ports/mem.h" @@ -21,11 +22,7 @@ extern "C" { #endif #ifndef MAX_SB_SIZE -#if CONFIG_AV1 && CONFIG_EXT_PARTITION #define MAX_SB_SIZE 128 -#else -#define MAX_SB_SIZE 64 -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION #endif // ndef MAX_SB_SIZE #define AOMMIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -52,22 +49,14 @@ extern "C" { #define UNLIKELY(v) (v) #endif -typedef uint16_t qm_val_t; +typedef uint8_t qm_val_t; #define AOM_QM_BITS 5 -#if CONFIG_HIGHBITDEPTH // Note: // tran_low_t is the datatype used for final transform coefficients. // tran_high_t is the datatype used for intermediate transform stages. typedef int64_t tran_high_t; typedef int32_t tran_low_t; -#else -// Note: -// tran_low_t is the datatype used for final transform coefficients. -// tran_high_t is the datatype used for intermediate transform stages. -typedef int32_t tran_high_t; -typedef int16_t tran_low_t; -#endif // CONFIG_HIGHBITDEPTH static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; @@ -77,10 +66,6 @@ static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); } -static INLINE uint32_t clamp32u(uint32_t value, uint32_t low, uint32_t high) { - return value < low ? low : (value > high ? high : value); -} - static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) { return value < low ? low : (value > high ? high : value); } @@ -98,6 +83,14 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { } } +// The result of this branchless code is equivalent to (value < 0 ? 0 : value) +// or max(0, value) and might be faster in some cases. +// Care should be taken since the behavior of right shifting signed type +// negative value is undefined by C standards and implementation defined, +static INLINE unsigned int negative_to_zero(int value) { + return value & ~(value >> (sizeof(value) * 8 - 1)); +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c index 11a57d382..5d7d4515b 100644 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c @@ -8,9 +8,11 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_config.h" +#include "config/aom_config.h" + #define RTCD_C -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/aom_once.h" void aom_dsp_rtcd() { once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index f4f6c64d4..a8ac5eb5c 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017, Alliance for Open Media. All rights reserved +## +## This source code is subject to the terms of the BSD 2 Clause License and +## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +## was not distributed with this source code in the LICENSE file, you can +## obtain it at www.aomedia.org/license/software. If the Alliance for Open +## Media Patent License 1.0 was not distributed with this source code in the +## PATENTS file, you can obtain it at www.aomedia.org/license/patent. +## sub aom_dsp_forward_decls() { print <=4 && $h >=4 && ($w == 2*$h || $h == 2*$w)); + push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w)); } } -@pred_names = qw/dc dc_top dc_left dc_128 v h d207e d63e d45e d117 d135 d153 paeth smooth/; -if (aom_config("CONFIG_SMOOTH_HV") eq "yes") { - push @pred_names, qw/smooth_v smooth_h/; -} +@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/; # # Intra prediction @@ -80,73 +75,125 @@ foreach (@tx_sizes) { foreach $pred_name (@pred_names) { add_proto "void", "aom_${pred_name}_predictor_${w}x${h}", "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}", - "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - } + add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}", + "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; } } specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/; specialize qw/aom_dc_top_predictor_4x8 sse2/; +specialize qw/aom_dc_top_predictor_4x16 sse2/; specialize qw/aom_dc_top_predictor_8x4 sse2/; specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/; specialize qw/aom_dc_top_predictor_8x16 sse2/; +specialize qw/aom_dc_top_predictor_8x32 sse2/; +specialize qw/aom_dc_top_predictor_16x4 sse2/; specialize qw/aom_dc_top_predictor_16x8 sse2/; specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/; specialize qw/aom_dc_top_predictor_16x32 sse2/; +specialize qw/aom_dc_top_predictor_16x64 sse2/; +specialize qw/aom_dc_top_predictor_32x8 sse2/; specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/; specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/; specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/; specialize qw/aom_dc_left_predictor_4x8 sse2/; +specialize qw/aom_dc_left_predictor_4x16 sse2/; specialize qw/aom_dc_left_predictor_8x4 sse2/; specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/; specialize qw/aom_dc_left_predictor_8x16 sse2/; +specialize qw/aom_dc_left_predictor_8x32 sse2/; +specialize qw/aom_dc_left_predictor_16x4 sse2/; specialize qw/aom_dc_left_predictor_16x8 sse2/; specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/; specialize qw/aom_dc_left_predictor_16x32 sse2/; +specialize qw/aom_dc_left_predictor_16x64 sse2/; +specialize qw/aom_dc_left_predictor_32x8 sse2/; specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/; specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/; +specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/; specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/; specialize qw/aom_dc_128_predictor_4x8 sse2/; +specialize qw/aom_dc_128_predictor_4x16 sse2/; specialize qw/aom_dc_128_predictor_8x4 sse2/; specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/; specialize qw/aom_dc_128_predictor_8x16 sse2/; +specialize qw/aom_dc_128_predictor_8x32 sse2/; +specialize qw/aom_dc_128_predictor_16x4 sse2/; specialize qw/aom_dc_128_predictor_16x8 sse2/; specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/; specialize qw/aom_dc_128_predictor_16x32 sse2/; +specialize qw/aom_dc_128_predictor_16x64 sse2/; +specialize qw/aom_dc_128_predictor_32x8 sse2/; specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/; specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/; specialize qw/aom_v_predictor_4x4 neon msa sse2/; specialize qw/aom_v_predictor_4x8 sse2/; +specialize qw/aom_v_predictor_4x16 sse2/; specialize qw/aom_v_predictor_8x4 sse2/; specialize qw/aom_v_predictor_8x8 neon msa sse2/; specialize qw/aom_v_predictor_8x16 sse2/; +specialize qw/aom_v_predictor_8x32 sse2/; +specialize qw/aom_v_predictor_16x4 sse2/; specialize qw/aom_v_predictor_16x8 sse2/; specialize qw/aom_v_predictor_16x16 neon msa sse2/; specialize qw/aom_v_predictor_16x32 sse2/; +specialize qw/aom_v_predictor_16x64 sse2/; +specialize qw/aom_v_predictor_32x8 sse2/; specialize qw/aom_v_predictor_32x16 sse2 avx2/; specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_v_predictor_32x64 sse2 avx2/; +specialize qw/aom_v_predictor_64x64 sse2 avx2/; +specialize qw/aom_v_predictor_64x32 sse2 avx2/; +specialize qw/aom_v_predictor_64x16 sse2 avx2/; specialize qw/aom_h_predictor_4x8 sse2/; +specialize qw/aom_h_predictor_4x16 sse2/; specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/; specialize qw/aom_h_predictor_8x4 sse2/; specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/; specialize qw/aom_h_predictor_8x16 sse2/; +specialize qw/aom_h_predictor_8x32 sse2/; +specialize qw/aom_h_predictor_16x4 sse2/; specialize qw/aom_h_predictor_16x8 sse2/; specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/; specialize qw/aom_h_predictor_16x32 sse2/; +specialize qw/aom_h_predictor_16x64 sse2/; +specialize qw/aom_h_predictor_32x8 sse2/; specialize qw/aom_h_predictor_32x16 sse2/; specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/; +specialize qw/aom_h_predictor_32x64 sse2/; +specialize qw/aom_h_predictor_64x64 sse2/; +specialize qw/aom_h_predictor_64x32 sse2/; +specialize qw/aom_h_predictor_64x16 sse2/; specialize qw/aom_paeth_predictor_4x4 ssse3/; specialize qw/aom_paeth_predictor_4x8 ssse3/; +specialize qw/aom_paeth_predictor_4x16 ssse3/; specialize qw/aom_paeth_predictor_8x4 ssse3/; specialize qw/aom_paeth_predictor_8x8 ssse3/; specialize qw/aom_paeth_predictor_8x16 ssse3/; +specialize qw/aom_paeth_predictor_8x32 ssse3/; +specialize qw/aom_paeth_predictor_16x4 ssse3/; specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/; specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/; specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x8 ssse3/; specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/; specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/; +specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/; specialize qw/aom_paeth_predictor_16x8 ssse3/; specialize qw/aom_paeth_predictor_16x16 ssse3/; specialize qw/aom_paeth_predictor_16x32 ssse3/; @@ -154,34 +201,86 @@ specialize qw/aom_paeth_predictor_32x16 ssse3/; specialize qw/aom_paeth_predictor_32x32 ssse3/; specialize qw/aom_smooth_predictor_4x4 ssse3/; specialize qw/aom_smooth_predictor_4x8 ssse3/; +specialize qw/aom_smooth_predictor_4x16 ssse3/; specialize qw/aom_smooth_predictor_8x4 ssse3/; specialize qw/aom_smooth_predictor_8x8 ssse3/; specialize qw/aom_smooth_predictor_8x16 ssse3/; +specialize qw/aom_smooth_predictor_8x32 ssse3/; +specialize qw/aom_smooth_predictor_16x4 ssse3/; specialize qw/aom_smooth_predictor_16x8 ssse3/; specialize qw/aom_smooth_predictor_16x16 ssse3/; specialize qw/aom_smooth_predictor_16x32 ssse3/; +specialize qw/aom_smooth_predictor_16x64 ssse3/; +specialize qw/aom_smooth_predictor_32x8 ssse3/; specialize qw/aom_smooth_predictor_32x16 ssse3/; specialize qw/aom_smooth_predictor_32x32 ssse3/; - -specialize qw/aom_d63e_predictor_4x4 ssse3/; -specialize qw/aom_d135_predictor_4x4 neon/; -specialize qw/aom_d153_predictor_4x4 ssse3/; +specialize qw/aom_smooth_predictor_32x64 ssse3/; +specialize qw/aom_smooth_predictor_64x64 ssse3/; +specialize qw/aom_smooth_predictor_64x32 ssse3/; +specialize qw/aom_smooth_predictor_64x16 ssse3/; + +specialize qw/aom_smooth_v_predictor_4x4 ssse3/; +specialize qw/aom_smooth_v_predictor_4x8 ssse3/; +specialize qw/aom_smooth_v_predictor_4x16 ssse3/; +specialize qw/aom_smooth_v_predictor_8x4 ssse3/; +specialize qw/aom_smooth_v_predictor_8x8 ssse3/; +specialize qw/aom_smooth_v_predictor_8x16 ssse3/; +specialize qw/aom_smooth_v_predictor_8x32 ssse3/; +specialize qw/aom_smooth_v_predictor_16x4 ssse3/; +specialize qw/aom_smooth_v_predictor_16x8 ssse3/; +specialize qw/aom_smooth_v_predictor_16x16 ssse3/; +specialize qw/aom_smooth_v_predictor_16x32 ssse3/; +specialize qw/aom_smooth_v_predictor_16x64 ssse3/; +specialize qw/aom_smooth_v_predictor_32x8 ssse3/; +specialize qw/aom_smooth_v_predictor_32x16 ssse3/; +specialize qw/aom_smooth_v_predictor_32x32 ssse3/; +specialize qw/aom_smooth_v_predictor_32x64 ssse3/; +specialize qw/aom_smooth_v_predictor_64x64 ssse3/; +specialize qw/aom_smooth_v_predictor_64x32 ssse3/; +specialize qw/aom_smooth_v_predictor_64x16 ssse3/; + +specialize qw/aom_smooth_h_predictor_4x4 ssse3/; +specialize qw/aom_smooth_h_predictor_4x8 ssse3/; +specialize qw/aom_smooth_h_predictor_4x16 ssse3/; +specialize qw/aom_smooth_h_predictor_8x4 ssse3/; +specialize qw/aom_smooth_h_predictor_8x8 ssse3/; +specialize qw/aom_smooth_h_predictor_8x16 ssse3/; +specialize qw/aom_smooth_h_predictor_8x32 ssse3/; +specialize qw/aom_smooth_h_predictor_16x4 ssse3/; +specialize qw/aom_smooth_h_predictor_16x8 ssse3/; +specialize qw/aom_smooth_h_predictor_16x16 ssse3/; +specialize qw/aom_smooth_h_predictor_16x32 ssse3/; +specialize qw/aom_smooth_h_predictor_16x64 ssse3/; +specialize qw/aom_smooth_h_predictor_32x8 ssse3/; +specialize qw/aom_smooth_h_predictor_32x16 ssse3/; +specialize qw/aom_smooth_h_predictor_32x32 ssse3/; +specialize qw/aom_smooth_h_predictor_32x64 ssse3/; +specialize qw/aom_smooth_h_predictor_64x64 ssse3/; +specialize qw/aom_smooth_h_predictor_64x32 ssse3/; +specialize qw/aom_smooth_h_predictor_64x16 ssse3/; + +# TODO(yunqingwang): optimize rectangular DC_PRED to replace division +# by multiply and shift. specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/; specialize qw/aom_dc_predictor_4x8 sse2/; -specialize qw/aom_d153_predictor_8x8 ssse3/; +specialize qw/aom_dc_predictor_4x16 sse2/; specialize qw/aom_dc_predictor_8x4 sse2/; specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/; specialize qw/aom_dc_predictor_8x16 sse2/; -specialize qw/aom_d153_predictor_16x16 ssse3/; +specialize qw/aom_dc_predictor_8x32 sse2/; +specialize qw/aom_dc_predictor_16x4 sse2/; specialize qw/aom_dc_predictor_16x8 sse2/; specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/; specialize qw/aom_dc_predictor_16x32 sse2/; -specialize qw/aom_d153_predictor_32x32 ssse3/; - +specialize qw/aom_dc_predictor_16x64 sse2/; +specialize qw/aom_dc_predictor_32x8 sse2/; specialize qw/aom_dc_predictor_32x16 sse2 avx2/; specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_dc_predictor_32x64 sse2 avx2/; +specialize qw/aom_dc_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_predictor_64x16 sse2 avx2/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { specialize qw/aom_highbd_v_predictor_4x4 sse2/; specialize qw/aom_highbd_v_predictor_4x8 sse2/; specialize qw/aom_highbd_v_predictor_8x4 sse2/; @@ -192,16 +291,21 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { specialize qw/aom_highbd_v_predictor_16x32 sse2/; specialize qw/aom_highbd_v_predictor_32x16 sse2/; specialize qw/aom_highbd_v_predictor_32x32 sse2/; - specialize qw/aom_highbd_dc_predictor_4x4 sse2/; + + # TODO(yunqingwang): optimize rectangular DC_PRED to replace division + # by multiply and shift. + specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/; specialize qw/aom_highbd_dc_predictor_4x8 sse2/; specialize qw/aom_highbd_dc_predictor_8x4 sse2/;; - specialize qw/aom_highbd_dc_predictor_8x8 sse2/;; + specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;; specialize qw/aom_highbd_dc_predictor_8x16 sse2/;; specialize qw/aom_highbd_dc_predictor_16x8 sse2/; - specialize qw/aom_highbd_dc_predictor_16x16 sse2/; + specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/; specialize qw/aom_highbd_dc_predictor_16x32 sse2/; specialize qw/aom_highbd_dc_predictor_32x16 sse2/; - specialize qw/aom_highbd_dc_predictor_32x32 sse2/; + specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/; + specialize qw/aom_highbd_dc_predictor_64x64 neon/; + specialize qw/aom_highbd_h_predictor_4x4 sse2/; specialize qw/aom_highbd_h_predictor_4x8 sse2/; specialize qw/aom_highbd_h_predictor_8x4 sse2/; @@ -242,253 +346,129 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/; specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/; specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/; - - specialize qw/aom_highbd_d117_predictor_4x4 sse2/; - specialize qw/aom_highbd_d117_predictor_8x8 ssse3/; - specialize qw/aom_highbd_d117_predictor_16x16 ssse3/; - specialize qw/aom_highbd_d117_predictor_32x32 ssse3/; - specialize qw/aom_highbd_d135_predictor_4x4 sse2/; - specialize qw/aom_highbd_d135_predictor_8x8 ssse3/; - specialize qw/aom_highbd_d135_predictor_16x16 ssse3/; - specialize qw/aom_highbd_d135_predictor_32x32 ssse3/; - specialize qw/aom_highbd_d153_predictor_4x4 sse2/; - specialize qw/aom_highbd_d153_predictor_8x8 ssse3/; - specialize qw/aom_highbd_d153_predictor_16x16 ssse3/; - specialize qw/aom_highbd_d153_predictor_32x32 ssse3/; - - specialize qw/aom_highbd_d45e_predictor_4x4 sse2/; - specialize qw/aom_highbd_d45e_predictor_4x8 sse2/; - specialize qw/aom_highbd_d45e_predictor_8x4 sse2/; - specialize qw/aom_highbd_d45e_predictor_8x8 sse2/; - specialize qw/aom_highbd_d45e_predictor_8x16 sse2/; - specialize qw/aom_highbd_d45e_predictor_16x8 avx2/; - specialize qw/aom_highbd_d45e_predictor_16x16 avx2/; - specialize qw/aom_highbd_d45e_predictor_16x32 avx2/; - specialize qw/aom_highbd_d45e_predictor_32x16 avx2/; - specialize qw/aom_highbd_d45e_predictor_32x32 avx2/; -} # CONFIG_HIGHBITDEPTH # # Sub Pixel Filters # add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - -add_proto qw/void aom_convolve8_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_vert_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg_vert_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; -add_proto qw/void aom_convolve8_avg_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h"; specialize qw/aom_convolve_copy sse2 /; -specialize qw/aom_convolve_avg sse2 /; -specialize qw/aom_convolve8 sse2 ssse3/, "$avx2_ssse3"; specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3"; specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3"; -specialize qw/aom_convolve8_avg sse2 ssse3/; -specialize qw/aom_convolve8_avg_horiz sse2 ssse3/; -specialize qw/aom_convolve8_avg_vert sse2 ssse3/; -specialize qw/aom_scaled_2d ssse3/; - -if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { - add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - add_proto qw/void aom_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - add_proto qw/void aom_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - add_proto qw/void aom_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - add_proto qw/void aom_convolve8_add_src_vert_hip/, "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; - - specialize qw/aom_convolve8_add_src ssse3/; - specialize qw/aom_convolve8_add_src_horiz ssse3/; - specialize qw/aom_convolve8_add_src_vert ssse3/; - specialize qw/aom_convolve8_add_src_hip sse2/; -} # CONFIG_LOOP_RESTORATION - -# TODO(any): These need to be extended to up to 128x128 block sizes -if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) { - specialize qw/aom_convolve_copy neon dspr2 msa/; - specialize qw/aom_convolve_avg neon dspr2 msa/; - specialize qw/aom_convolve8 neon dspr2 msa/; - specialize qw/aom_convolve8_horiz neon dspr2 msa/; - specialize qw/aom_convolve8_vert neon dspr2 msa/; - specialize qw/aom_convolve8_avg neon dspr2 msa/; - specialize qw/aom_convolve8_avg_horiz neon dspr2 msa/; - specialize qw/aom_convolve8_avg_vert neon dspr2 msa/; -} -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve_copy sse2 avx2/; +add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; +specialize qw/aom_highbd_convolve_copy sse2 avx2/; - add_proto qw/void aom_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve_avg sse2 avx2/; +add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; +specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64"; - add_proto qw/void aom_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve8 avx2/, "$sse2_x86_64"; - - add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64"; - - add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64"; - - add_proto qw/void aom_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve8_avg avx2/, "$sse2_x86_64"; - - add_proto qw/void aom_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve8_avg_horiz avx2/, "$sse2_x86_64"; - - add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/aom_highbd_convolve8_avg_vert avx2/, "$sse2_x86_64"; - - if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") { - add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - add_proto qw/void aom_highbd_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - add_proto qw/void aom_highbd_convolve8_add_src_vert_hip/, "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - - specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64"; - specialize qw/aom_highbd_convolve8_add_src_hip ssse3/; - # The _horiz/_vert functions are currently unused, so we don't bother - # specialising them. - } # CONFIG_LOOP_RESTORATION -} # CONFIG_HIGHBITDEPTH +add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; +specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64"; # # Loopfilter # -add_proto qw/void aom_lpf_vertical_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_vertical_16 sse2/; -} else { - specialize qw/aom_lpf_vertical_16 sse2 neon_asm dspr2 msa/; - $aom_lpf_vertical_16_neon_asm=aom_lpf_vertical_16_neon; -} +add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_14 sse2 neon/; -add_proto qw/void aom_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_vertical_16_dual sse2 neon_asm dspr2 msa/; - $aom_lpf_vertical_16_dual_neon_asm=aom_lpf_vertical_16_dual_neon; -} +add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_14_dual sse2/; + +add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_vertical_6 sse2/; add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_vertical_8 sse2/; -} else { - specialize qw/aom_lpf_vertical_8 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_vertical_8 sse2 neon/; add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_vertical_8_dual sse2 neon_asm dspr2 msa/; - $aom_lpf_vertical_8_dual_neon_asm=aom_lpf_vertical_8_dual_neon; -} +specialize qw/aom_lpf_vertical_8_dual sse2/; add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_vertical_4 sse2/; -} else { - specialize qw/aom_lpf_vertical_4 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_vertical_4 sse2/; add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_vertical_4_dual sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_vertical_4_dual sse2/; -add_proto qw/void aom_lpf_horizontal_edge_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_edge_8 sse2/; -} else { - specialize qw/aom_lpf_horizontal_edge_8 sse2 avx2 neon_asm dspr2 msa/; - $aom_lpf_horizontal_edge_8_neon_asm=aom_lpf_horizontal_edge_8_neon; -} +add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_14 sse2/; -add_proto qw/void aom_lpf_horizontal_edge_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_edge_16 sse2/; -} else { - specialize qw/aom_lpf_horizontal_edge_16 sse2 avx2 neon_asm dspr2 msa/; - $aom_lpf_horizontal_edge_16_neon_asm=aom_lpf_horizontal_edge_16_neon; -} +add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_14_dual sse2/; + +add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; +specialize qw/aom_lpf_horizontal_6 sse2 neon/; + +add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_horizontal_6_dual sse2/; add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_8 sse2/; -} else { - specialize qw/aom_lpf_horizontal_8 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_horizontal_8 sse2 neon/; add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_horizontal_8_dual sse2 neon_asm dspr2 msa/; - $aom_lpf_horizontal_8_dual_neon_asm=aom_lpf_horizontal_8_dual_neon; -} +specialize qw/aom_lpf_horizontal_8_dual sse2/; add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") eq "yes") { - specialize qw/aom_lpf_horizontal_4 sse2/; -} else { - specialize qw/aom_lpf_horizontal_4 sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_horizontal_4 sse2/; add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; -if (aom_config("CONFIG_PARALLEL_DEBLOCKING") ne "yes") { - specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; -} +specialize qw/aom_lpf_horizontal_4_dual sse2/; + +add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_14 sse2/; + +add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/; + +add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_8 sse2/; -if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_16 sse2/; +add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_6 sse2/; - add_proto qw/void aom_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_16_dual sse2 avx2/; +add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; +specialize qw/aom_lpf_vertical_6_dual sse2/; - add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_8 sse2/; +add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_vertical_6_dual sse2/; - add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_vertical_4 sse2/; +add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_vertical_4 sse2/; - add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_horizontal_edge_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_edge_8 sse2/; +add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_14 sse2/; - add_proto qw/void aom_highbd_lpf_horizontal_edge_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_edge_16 sse2 avx2/; +add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limt1, const uint8_t *thresh1,int bd"; +specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/; - add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_8 sse2/; +add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_6 sse2/; - add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; +add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/; - add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/aom_highbd_lpf_horizontal_4 sse2/; +add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_8 sse2/; - add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; -} # CONFIG_HIGHBITDEPTH +add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/; + +add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; +specialize qw/aom_highbd_lpf_horizontal_4 sse2/; + +add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; +specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; + +# Helper functions. +add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; +specialize "av1_round_shift_array", qw/sse4_1 neon/; # # Encoder functions. @@ -497,170 +477,43 @@ if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { # # Forward transform # -if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq "yes")){ - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct4x4 sse2/; - - add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct4x4_1 sse2/; - +if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64"; - add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct16x16 sse2/; - - add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct32x32 sse2 avx2/; - - add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct32x32_rd sse2 avx2/; - # High bit depth - add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_highbd_fdct4x4 sse2/; - add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/aom_highbd_fdct8x8 sse2/; - add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_highbd_fdct16x16 sse2/; - - add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_highbd_fdct32x32 sse2/; - - add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_highbd_fdct32x32_rd sse2/; - - } else { - add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct4x4 sse2 msa/; + # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation) + add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output"; - add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct4x4_1 sse2/; + add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft4x4_float sse2/; - add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; - - add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct16x16 sse2 msa/; - - add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct32x32 sse2 avx2 msa/; - - add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/aom_fdct32x32_rd sse2 avx2 msa/; - } # CONFIG_HIGHBITDEPTH -} # CONFIG_AV1_ENCODER - -# -# Inverse transform -if (aom_config("CONFIG_AV1") eq "yes") { - add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - - add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_iwht4x4_16_add sse2/; - - add_proto qw/void aom_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - - add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_16_add sse2/; - - add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_1_add sse2/; - - add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_64_add sse2 ssse3/; - - add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_12_add sse2 ssse3/; - - add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_1_add sse2/; - - add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_256_add sse2 avx2/; - - add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_38_add avx2/; - - add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_10_add sse2 avx2/; - - add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_1_add sse2 avx2/; - - add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/; - - add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/; - # Need to add 135 eob idct32x32 implementations. - $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; - - add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/; - - add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1_add sse2 avx2/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - } else { - add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_1_add sse2 neon dspr2 msa/; - - add_proto qw/void aom_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct4x4_16_add sse2 neon dspr2 msa/; - - add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_1_add sse2 neon dspr2 msa/; + add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft8x8_float avx2 sse2/; - add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_64_add sse2 ssse3 neon dspr2 msa/; + add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft16x16_float avx2 sse2/; - add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct8x8_12_add sse2 ssse3 neon dspr2 msa/; + add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_fft32x32_float avx2 sse2/; - add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_1_add sse2 avx2 neon dspr2 msa/; + add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output"; - add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_256_add sse2 avx2 neon dspr2 msa/; + add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft4x4_float sse2/; - add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_38_add avx2/; + add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft8x8_float avx2 sse2/; - add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct16x16_10_add sse2 avx2 neon dspr2 msa/; + add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft16x16_float avx2 sse2/; - add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2 neon dspr2 msa/; - - add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2 neon dspr2 msa/; - # Need to add 135 eob idct32x32 implementations. - $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2; - $aom_idct32x32_135_add_neon=aom_idct32x32_1024_add_neon; - $aom_idct32x32_135_add_dspr2=aom_idct32x32_1024_add_dspr2; - $aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa; - - add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2 neon dspr2 msa/; - # Need to add 34 eob idct32x32 neon implementation. - $aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon; - - add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_idct32x32_1_add sse2 avx2 neon dspr2 msa/; - - add_proto qw/void aom_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_iwht4x4_1_add msa/; - - add_proto qw/void aom_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/aom_iwht4x4_16_add msa sse2/; - } # CONFIG_HIGHBITDEPTH -} # CONFIG_AV1 + add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output"; + specialize qw/aom_ifft32x32_float avx2 sse2/; +} # CONFIG_AV1_ENCODER # # Quantization @@ -685,29 +538,26 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; } # CONFIG_AV1_ENCODER -if (aom_config("CONFIG_AV1") eq "yes") { - # - # Alpha blending with mask - # - if (aom_config("CONFIG_CONVOLVE_ROUND") eq "yes") { - add_proto qw/void aom_blend_a64_d32_mask/, "int32_t *dst, uint32_t dst_stride, const int32_t *src0, uint32_t src0_stride, const int32_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; - } - add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx"; - add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w"; - add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w"; - specialize "aom_blend_a64_mask", qw/sse4_1/; - specialize "aom_blend_a64_hmask", qw/sse4_1/; - specialize "aom_blend_a64_vmask", qw/sse4_1/; - - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd"; - add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd"; - add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd"; - specialize "aom_highbd_blend_a64_mask", qw/sse4_1/; - specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/; - specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/; - } -} # CONFIG_AV1 + +# +# Alpha blending with mask +# +add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params"; +specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 neon/; +add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd"; +add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby"; +add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; +specialize "aom_blend_a64_mask", qw/sse4_1/; +specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; +specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; + +add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd"; +add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; +add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd"; +specialize "aom_highbd_blend_a64_mask", qw/sse4_1/; +specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/; +specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/; if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # @@ -716,6 +566,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/aom_subtract_block neon msa sse2/; + add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; + specialize qw/aom_highbd_subtract_block sse2/; + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Sum of Squares @@ -728,47 +581,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { } - # - # Avg - # - if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - # - # Avg - # - specialize qw/aom_avg_8x8 sse2 neon msa/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; - specialize qw/aom_highbd_subtract_block sse2/; - } - - # - # Minmax - # - add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - specialize qw/aom_minmax_8x8 sse2 neon/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - } - - add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64"; - - add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/aom_hadamard_16x16 sse2 neon/; - - add_proto qw/int aom_satd/, "const int16_t *coeff, int length"; - specialize qw/aom_satd sse2 neon/; - - add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, int ref_stride, int height"; - specialize qw/aom_int_pro_row sse2 neon/; - - add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, int width"; - specialize qw/aom_int_pro_col sse2 neon/; - - add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl"; - specialize qw/aom_vector_var neon sse2/; - } # CONFIG_AV1_ENCODER - # # Single block SAD / Single block Avg SAD # @@ -776,6 +588,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; + add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param"; } specialize qw/aom_sad128x128 avx2 sse2/; @@ -812,7 +625,59 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sad4x8_avg msa sse2/; specialize qw/aom_sad4x4_avg msa sse2/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + specialize qw/aom_sad4x16 sse2/; + specialize qw/aom_sad16x4 sse2/; + specialize qw/aom_sad8x32 sse2/; + specialize qw/aom_sad32x8 sse2/; + specialize qw/aom_sad16x64 sse2/; + specialize qw/aom_sad64x16 sse2/; + + specialize qw/aom_sad4x16_avg sse2/; + specialize qw/aom_sad16x4_avg sse2/; + specialize qw/aom_sad8x32_avg sse2/; + specialize qw/aom_sad32x8_avg sse2/; + specialize qw/aom_sad16x64_avg sse2/; + specialize qw/aom_sad64x16_avg sse2/; + + specialize qw/aom_jnt_sad128x128_avg ssse3/; + specialize qw/aom_jnt_sad128x64_avg ssse3/; + specialize qw/aom_jnt_sad64x128_avg ssse3/; + specialize qw/aom_jnt_sad64x64_avg ssse3/; + specialize qw/aom_jnt_sad64x32_avg ssse3/; + specialize qw/aom_jnt_sad32x64_avg ssse3/; + specialize qw/aom_jnt_sad32x32_avg ssse3/; + specialize qw/aom_jnt_sad32x16_avg ssse3/; + specialize qw/aom_jnt_sad16x32_avg ssse3/; + specialize qw/aom_jnt_sad16x16_avg ssse3/; + specialize qw/aom_jnt_sad16x8_avg ssse3/; + specialize qw/aom_jnt_sad8x16_avg ssse3/; + specialize qw/aom_jnt_sad8x8_avg ssse3/; + specialize qw/aom_jnt_sad8x4_avg ssse3/; + specialize qw/aom_jnt_sad4x8_avg ssse3/; + specialize qw/aom_jnt_sad4x4_avg ssse3/; + + specialize qw/aom_jnt_sad4x16_avg ssse3/; + specialize qw/aom_jnt_sad16x4_avg ssse3/; + specialize qw/aom_jnt_sad8x32_avg ssse3/; + specialize qw/aom_jnt_sad32x8_avg ssse3/; + specialize qw/aom_jnt_sad16x64_avg ssse3/; + specialize qw/aom_jnt_sad64x16_avg ssse3/; + + add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; + + specialize qw/aom_sad4xh sse2/; + specialize qw/aom_sad8xh sse2/; + specialize qw/aom_sad16xh sse2/; + specialize qw/aom_sad32xh sse2/; + specialize qw/aom_sad64xh sse2/; + specialize qw/aom_sad128xh sse2/; + + foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; @@ -821,31 +686,45 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_highbd_sad${w}x${h}", qw/sse2/; specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; } + add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param"; } specialize qw/aom_highbd_sad128x128 avx2/; specialize qw/aom_highbd_sad128x64 avx2/; specialize qw/aom_highbd_sad64x128 avx2/; - specialize qw/aom_highbd_sad64x64 avx2/; - specialize qw/aom_highbd_sad64x32 avx2/; - specialize qw/aom_highbd_sad32x64 avx2/; - specialize qw/aom_highbd_sad32x32 avx2/; - specialize qw/aom_highbd_sad32x16 avx2/; - specialize qw/aom_highbd_sad16x32 avx2/; - specialize qw/aom_highbd_sad16x16 avx2/; - specialize qw/aom_highbd_sad16x8 avx2/; + specialize qw/aom_highbd_sad64x64 avx2 sse2/; + specialize qw/aom_highbd_sad64x32 avx2 sse2/; + specialize qw/aom_highbd_sad32x64 avx2 sse2/; + specialize qw/aom_highbd_sad32x32 avx2 sse2/; + specialize qw/aom_highbd_sad32x16 avx2 sse2/; + specialize qw/aom_highbd_sad16x32 avx2 sse2/; + specialize qw/aom_highbd_sad16x16 avx2 sse2/; + specialize qw/aom_highbd_sad16x8 avx2 sse2/; + specialize qw/aom_highbd_sad8x4 sse2/; specialize qw/aom_highbd_sad128x128_avg avx2/; specialize qw/aom_highbd_sad128x64_avg avx2/; specialize qw/aom_highbd_sad64x128_avg avx2/; - specialize qw/aom_highbd_sad64x64_avg avx2/; - specialize qw/aom_highbd_sad64x32_avg avx2/; - specialize qw/aom_highbd_sad32x64_avg avx2/; - specialize qw/aom_highbd_sad32x32_avg avx2/; - specialize qw/aom_highbd_sad32x16_avg avx2/; - specialize qw/aom_highbd_sad16x32_avg avx2/; - specialize qw/aom_highbd_sad16x16_avg avx2/; - specialize qw/aom_highbd_sad16x8_avg avx2/; - } + specialize qw/aom_highbd_sad64x64_avg avx2 sse2/; + specialize qw/aom_highbd_sad64x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x64_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad32x16_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x32_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x16_avg avx2 sse2/; + specialize qw/aom_highbd_sad16x8_avg avx2 sse2/; + specialize qw/aom_highbd_sad8x4_avg sse2/; + + specialize qw/aom_highbd_sad16x4 sse2/; + specialize qw/aom_highbd_sad8x32 sse2/; + specialize qw/aom_highbd_sad32x8 sse2/; + specialize qw/aom_highbd_sad16x64 sse2/; + specialize qw/aom_highbd_sad64x16 sse2/; + + specialize qw/aom_highbd_sad16x4_avg sse2/; + specialize qw/aom_highbd_sad8x32_avg sse2/; + specialize qw/aom_highbd_sad32x8_avg sse2/; + specialize qw/aom_highbd_sad16x64_avg sse2/; + specialize qw/aom_highbd_sad64x16_avg sse2/; # # Masked SAD @@ -856,90 +735,34 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_masked_sad${w}x${h}", qw/ssse3/; } - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/; } - } + # # OBMC SAD # - if (aom_config("CONFIG_MOTION_VAR") eq "yes") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/; + } + } + + foreach (@block_sizes) { ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; + add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/; - } - } - - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; - if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; - } + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; } } - } - # - # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally - # - # Blocks of 3 - foreach $s (@block_widths) { - add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - } - specialize qw/aom_sad64x64x3 msa/; - specialize qw/aom_sad32x32x3 msa/; - specialize qw/aom_sad16x16x3 sse3 ssse3 msa/; - specialize qw/aom_sad8x8x3 sse3 msa/; - specialize qw/aom_sad4x4x3 sse3 msa/; - - add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/aom_sad16x8x3 sse3 ssse3 msa/; - add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/aom_sad8x16x3 sse3 msa/; - - # Blocks of 8 - foreach $s (@block_widths) { - add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - } - specialize qw/aom_sad64x64x8 msa/; - specialize qw/aom_sad32x32x8 msa/; - specialize qw/aom_sad16x16x8 sse4_1 msa/; - specialize qw/aom_sad8x8x8 sse4_1 msa/; - specialize qw/aom_sad4x4x8 sse4_1 msa/; - - add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/aom_sad16x8x8 sse4_1 msa/; - add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/aom_sad8x16x8 sse4_1 msa/; - add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/aom_sad8x4x8 msa/; - add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - specialize qw/aom_sad4x8x8 msa/; - - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $s (@block_widths) { - # Blocks of 3 - add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - # Blocks of 8 - add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - } - # Blocks of 3 - add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - # Blocks of 8 - add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - } # # Multi-block SAD, comparing a reference to N independent blocks @@ -966,29 +789,47 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sad4x8x4d msa sse2/; specialize qw/aom_sad4x4x4d msa sse2/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - # - # Multi-block SAD, comparing a reference to N independent blocks - # - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; - if ($w != 128 && $h != 128) { - specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; - } + specialize qw/aom_sad4x16x4d sse2/; + specialize qw/aom_sad16x4x4d sse2/; + specialize qw/aom_sad8x32x4d sse2/; + specialize qw/aom_sad32x8x4d sse2/; + specialize qw/aom_sad16x64x4d sse2/; + specialize qw/aom_sad64x16x4d sse2/; + + # + # Multi-block SAD, comparing a reference to N independent blocks + # + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; + if ($w != 128 && $h != 128) { + specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/; } - specialize qw/aom_highbd_sad128x128x4d avx2/; - specialize qw/aom_highbd_sad128x64x4d avx2/; - specialize qw/aom_highbd_sad64x128x4d avx2/; - specialize qw/aom_highbd_sad64x64x4d avx2/; - specialize qw/aom_highbd_sad64x32x4d avx2/; - specialize qw/aom_highbd_sad32x64x4d avx2/; - specialize qw/aom_highbd_sad32x32x4d avx2/; - specialize qw/aom_highbd_sad32x16x4d avx2/; - specialize qw/aom_highbd_sad16x32x4d avx2/; - specialize qw/aom_highbd_sad16x16x4d avx2/; - specialize qw/aom_highbd_sad16x8x4d avx2/; } + specialize qw/aom_highbd_sad128x128x4d avx2/; + specialize qw/aom_highbd_sad128x64x4d avx2/; + specialize qw/aom_highbd_sad64x128x4d avx2/; + specialize qw/aom_highbd_sad64x64x4d sse2 avx2/; + specialize qw/aom_highbd_sad64x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x64x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad32x16x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x32x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x16x4d sse2 avx2/; + specialize qw/aom_highbd_sad16x8x4d sse2 avx2/; + specialize qw/aom_highbd_sad8x16x4d sse2/; + specialize qw/aom_highbd_sad8x8x4d sse2/; + specialize qw/aom_highbd_sad8x4x4d sse2/; + specialize qw/aom_highbd_sad4x8x4d sse2/; + specialize qw/aom_highbd_sad4x4x4d sse2/; + + specialize qw/aom_highbd_sad4x16x4d sse2/; + specialize qw/aom_highbd_sad16x4x4d sse2/; + specialize qw/aom_highbd_sad8x32x4d sse2/; + specialize qw/aom_highbd_sad32x8x4d sse2/; + specialize qw/aom_highbd_sad16x64x4d sse2/; + specialize qw/aom_highbd_sad64x16x4d sse2/; + # # Structured Similarity (SSIM) @@ -1000,9 +841,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; - } + add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr"; + } } # CONFIG_AV1_ENCODER @@ -1015,8 +855,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/aom_get16x16var sse2 avx2 neon msa/; - specialize qw/aom_get8x8var sse2 neon msa/; + specialize qw/aom_get16x16var neon msa/; + specialize qw/aom_get8x8var neon msa/; add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; @@ -1029,7 +869,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_mse8x16 sse2 msa/; specialize qw/aom_mse8x8 sse2 msa/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { foreach $bd (8, 10, 12) { add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; @@ -1042,25 +881,48 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_highbd_${bd}_mse16x16", qw/sse2/; specialize "aom_highbd_${bd}_mse8x8", qw/sse2/; } - } + # - # ... # - add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; + # + add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, int ref_stride"; specialize qw/aom_upsampled_pred sse2/; - add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride"; + + add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride"; specialize qw/aom_comp_avg_upsampled_pred sse2/; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; - specialize qw/aom_highbd_upsampled_pred sse2/; - add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, int bd"; - specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; - } + add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const JNT_COMP_PARAMS *jcp_param"; + specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/; + + add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + specialize qw/aom_highbd_upsampled_pred sse2/; + + add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; + + add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param"; + specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/; + + + # # - # ... # add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *"; add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride"; @@ -1082,27 +944,33 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param"; } - + specialize qw/aom_variance128x128 sse2 avx2 /; + specialize qw/aom_variance128x64 sse2 avx2 /; + specialize qw/aom_variance64x128 sse2 avx2 /; specialize qw/aom_variance64x64 sse2 avx2 neon msa/; specialize qw/aom_variance64x32 sse2 avx2 neon msa/; - specialize qw/aom_variance32x64 sse2 neon msa/; + specialize qw/aom_variance32x64 sse2 avx2 neon msa/; specialize qw/aom_variance32x32 sse2 avx2 neon msa/; specialize qw/aom_variance32x16 sse2 avx2 msa/; - specialize qw/aom_variance16x32 sse2 msa/; + specialize qw/aom_variance16x32 sse2 avx2 msa/; specialize qw/aom_variance16x16 sse2 avx2 neon msa/; - specialize qw/aom_variance16x8 sse2 neon msa/; + specialize qw/aom_variance16x8 sse2 avx2 neon msa/; specialize qw/aom_variance8x16 sse2 neon msa/; specialize qw/aom_variance8x8 sse2 neon msa/; specialize qw/aom_variance8x4 sse2 msa/; specialize qw/aom_variance4x8 sse2 msa/; specialize qw/aom_variance4x4 sse2 msa/; + specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/; specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/; @@ -1112,73 +980,100 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/; specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - - if (aom_config("CONFIG_EXT_PARTITION_TYPES") eq "yes") { - specialize qw/aom_variance4x16 sse2/; - specialize qw/aom_variance16x4 sse2/; - specialize qw/aom_variance8x32 sse2/; - specialize qw/aom_variance32x8 sse2/; - specialize qw/aom_variance16x64 sse2/; - specialize qw/aom_variance64x16 sse2/; - specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; - } - - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd (8, 10, 12) { - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + + specialize qw/aom_variance4x16 sse2/; + specialize qw/aom_variance16x4 sse2 avx2/; + specialize qw/aom_variance8x32 sse2/; + specialize qw/aom_variance32x8 sse2 avx2/; + specialize qw/aom_variance16x64 sse2 avx2/; + specialize qw/aom_variance64x16 sse2 avx2/; + specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; + + specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance16x8 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance8x16 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance8x8 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance8x4 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance4x8 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance4x4 ssse3/; + + specialize qw/aom_jnt_sub_pixel_avg_variance4x16 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance16x4 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance8x32 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance32x8 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/; + + specialize qw/aom_jnt_sub_pixel_avg_variance128x128 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance128x64 ssse3/; + specialize qw/aom_jnt_sub_pixel_avg_variance64x128 ssse3/; + + + foreach $bd (8, 10, 12) { + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { - specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; - } - # TODO(david.barker): When ext-partition-types is enabled, we currently - # don't have vectorized 4x16 highbd variance functions - if ($w == 4 && $h == 4) { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { + specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2"; + } + # TODO(david.barker): When ext-partition-types is enabled, we currently + # don't have vectorized 4x16 highbd variance functions + if ($w == 4 && $h == 4) { specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1"; } - if ($w != 128 && $h != 128 && $w != 4) { - specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; - } - if ($w == 4 && $h == 4) { - specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; - specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; - } + if ($w != 128 && $h != 128 && $w != 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/; + } + if ($w == 4 && $h == 4) { + specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; + specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; } + + add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param"; } - } # CONFIG_HIGHBITDEPTH + } # # Masked Variance / Masked Subpixel Variance @@ -1189,7 +1084,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/; } - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + foreach $bd ("_8_", "_10_", "_12_") { foreach (@block_sizes) { ($w, $h) = @$_; @@ -1197,30 +1092,28 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/; } } - } + # # OBMC Variance / OBMC Subpixel Variance # - if (aom_config("CONFIG_MOTION_VAR") eq "yes") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; - } + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; + } - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - foreach $bd ("_", "_10_", "_12_") { - foreach (@block_sizes) { - ($w, $h) = @$_; - add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; - } + + foreach $bd ("_", "_10_", "_12_") { + foreach (@block_sizes) { + ($w, $h) = @$_; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; + specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/; } } - } + add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; @@ -1260,7 +1153,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - # # Specialty Subpixel # @@ -1277,7 +1169,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # Comp Avg # add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { + + add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; + specialize qw/aom_jnt_comp_avg_pred ssse3/; + + add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/aom_highbd_12_variance64x64 sse2/; @@ -1415,6 +1311,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; + specialize qw/aom_highbd_jnt_comp_avg_pred sse2/; + # # Subpixel Variance # @@ -1634,14 +1533,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - } # CONFIG_HIGHBITDEPTH + add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - add_proto qw/void aom_comp_mask_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") { - add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subsample_x_q3, int subsample_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd"; - } + specialize qw/aom_comp_mask_pred ssse3 avx2/; + + add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd"; + } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h index 58e8bb284..fd4f51b29 100644 --- a/third_party/aom/aom_dsp/aom_filter.h +++ b/third_party/aom/aom_dsp/aom_filter.h @@ -31,6 +31,13 @@ extern "C" { #define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS) #define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2) +#define RS_SUBPEL_BITS 6 +#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1) +#define RS_SCALE_SUBPEL_BITS 14 +#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1) +#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS) +#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1)) + typedef int16_t InterpKernel[SUBPEL_TAPS]; #define BIL_SUBPEL_BITS 3 diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h index 469fd8ed2..392b36627 100644 --- a/third_party/aom/aom_dsp/aom_simd.h +++ b/third_party/aom/aom_dsp/aom_simd.h @@ -18,8 +18,9 @@ #include #endif -#include "./aom_config.h" -#include "./aom_simd_inline.h" +#include "config/aom_config.h" + +#include "aom_dsp/aom_simd_inline.h" #define SIMD_CHECK 1 // Sanity checks in C equivalents diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c deleted file mode 100644 index 09429d6d2..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon.c +++ /dev/null @@ -1,364 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void aom_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_y; - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - src += 7; - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w; width > 0; width -= 4, src += 4, dst += 4) { // loop_horiz - s = src; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(src + 64); - - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(src + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(src + 64 + src_stride * 2); - - d = dst; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - __builtin_prefetch(src + 64 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - d = dst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - src += src_stride * 4 - w - 7; - dst += dst_stride * 4 - w; - } - return; -} - -void aom_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint8x8_t d2u8, d3u8; - uint32x2_t d2u32, d3u32, d6u32, d7u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - uint8x16_t q1u8, q3u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_x; - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0); - d += dst_stride; - d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0); - d += dst_stride; - d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1); - d -= dst_stride * 3; - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - q1u8 = vcombine_u8(d2u8, d3u8); - q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32)); - - q1u8 = vrhaddq_u8(q1u8, q3u8); - - d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8)); - d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm deleted file mode 100644 index 80aef992d..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_avg_neon_asm.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; AV1_FILTER_WEIGHT == 128 - ; AV1_FILTER_SHIFT == 7 - - EXPORT |aom_convolve8_avg_horiz_neon| - EXPORT |aom_convolve8_avg_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|aom_convolve8_avg_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h - - vld1.s16 {q0}, [r5] ; filter_x - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -aom_convolve8_avg_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -aom_convolve8_avg_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; slightly out of order load to match the existing data - vld1.u32 {d6[0]}, [r2], r3 - vld1.u32 {d7[0]}, [r2], r3 - vld1.u32 {d6[1]}, [r2], r3 - vld1.u32 {d7[1]}, [r2], r3 - - sub r2, r2, r3, lsl #2 ; reset for store - - ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_avg_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt aom_convolve8_avg_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|aom_convolve8_avg_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h - - vld1.s16 {q0}, [r4] ; filter_y - - lsl r1, r1, #1 - lsl r3, r3, #1 - -aom_convolve8_avg_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -aom_convolve8_avg_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - vld1.u32 {d6[0]}, [r5@32], r3 - vld1.u32 {d6[1]}, [r8@32], r3 - vld1.u32 {d7[0]}, [r5@32], r3 - vld1.u32 {d7[1]}, [r8@32], r3 - - pld [r7] - pld [r4] - - ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r5] - pld [r8] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - sub r5, r5, r3, lsl #1 ; reset for store - sub r8, r8, r3, lsl #1 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt aom_convolve8_avg_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_avg_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c deleted file mode 100644 index 8ebffb5f9..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" -#include "aom_ports/mem.h" - -static INLINE int32x4_t MULTIPLY_BY_Q0(int16x4_t dsrc0, int16x4_t dsrc1, - int16x4_t dsrc2, int16x4_t dsrc3, - int16x4_t dsrc4, int16x4_t dsrc5, - int16x4_t dsrc6, int16x4_t dsrc7, - int16x8_t q0s16) { - int32x4_t qdst; - int16x4_t d0s16, d1s16; - - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - - qdst = vmull_lane_s16(dsrc0, d0s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3); - qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0); - qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1); - qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2); - qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3); - return qdst; -} - -void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - int width; - const uint8_t *s, *psrc; - uint8_t *d, *pdst; - uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8; - uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32; - uint8x16_t q12u8, q13u8, q14u8, q15u8; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - uint16x8x2_t q0x2u16; - uint8x8x2_t d0x2u8, d1x2u8; - uint32x2x2_t d0x2u32; - uint16x4x2_t d0x2u16, d1x2u16; - uint32x4x2_t q0x2u32; - - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_y; - - q0s16 = vld1q_s16(filter_x); - - src -= 3; // adjust for taps - for (; h > 0; h -= 4, src += src_stride * 4, - dst += dst_stride * 4) { // loop_horiz_v - s = src; - d24u8 = vld1_u8(s); - s += src_stride; - d25u8 = vld1_u8(s); - s += src_stride; - d26u8 = vld1_u8(s); - s += src_stride; - d27u8 = vld1_u8(s); - - q12u8 = vcombine_u8(d24u8, d25u8); - q13u8 = vcombine_u8(d26u8, d27u8); - - q0x2u16 = - vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8)); - d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0])); - d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0])); - d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1])); - d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1])); - d0x2u8 = vtrn_u8(d24u8, d25u8); - d1x2u8 = vtrn_u8(d26u8, d27u8); - - __builtin_prefetch(src + src_stride * 4); - __builtin_prefetch(src + src_stride * 5); - __builtin_prefetch(src + src_stride * 6); - - q8u16 = vmovl_u8(d0x2u8.val[0]); - q9u16 = vmovl_u8(d0x2u8.val[1]); - q10u16 = vmovl_u8(d1x2u8.val[0]); - q11u16 = vmovl_u8(d1x2u8.val[1]); - - d16u16 = vget_low_u16(q8u16); - d17u16 = vget_high_u16(q8u16); - d18u16 = vget_low_u16(q9u16); - d19u16 = vget_high_u16(q9u16); - q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18 - q9u16 = vcombine_u16(d17u16, d19u16); - - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21 - for (width = w, psrc = src + 7, pdst = dst; width > 0; - width -= 4, psrc += 4, pdst += 4) { // loop_horiz - s = psrc; - d28u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d29u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d31u32 = vld1_dup_u32((const uint32_t *)s); - s += src_stride; - d30u32 = vld1_dup_u32((const uint32_t *)s); - - __builtin_prefetch(psrc + 64); - - d0x2u16 = - vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32)); - d1x2u16 = - vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32)); - d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28 - vreinterpret_u8_u16(d1x2u16.val[0])); // d29 - d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31 - vreinterpret_u8_u16(d1x2u16.val[1])); // d30 - - __builtin_prefetch(psrc + 64 + src_stride); - - q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]); - q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]); - q0x2u32 = - vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8)); - - d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0])); - d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0])); - q12u16 = vmovl_u8(d28u8); - q13u16 = vmovl_u8(d29u8); - - __builtin_prefetch(psrc + 64 + src_stride * 2); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16, - d23s16, d24s16, q0s16); - q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16, - d24s16, d26s16, q0s16); - q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16, - d26s16, d27s16, q0s16); - q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - __builtin_prefetch(psrc + 60 + src_stride * 3); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u8 = vqmovn_u16(q1u16); - d3u8 = vqmovn_u16(q2u16); - - d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8)); - d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]), - vreinterpret_u32_u16(d0x2u16.val[1])); - d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]), - vreinterpret_u8_u32(d0x2u32.val[1])); - - d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]); - d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]); - - d = pdst; - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - - q8u16 = q9u16; - d20s16 = d23s16; - q11u16 = q12u16; - q9u16 = q13u16; - d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16)); - } - } - return; -} - -void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int height; - const uint8_t *s; - uint8_t *d; - uint32x2_t d2u32, d3u32; - uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16; - int16x4_t d24s16, d25s16, d26s16, d27s16; - uint16x4_t d2u16, d3u16, d4u16, d5u16; - int16x8_t q0s16; - uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16; - int32x4_t q1s32, q2s32, q14s32, q15s32; - - assert(y_step_q4 == 16); - - (void)x_step_q4; - (void)y_step_q4; - (void)filter_x; - - src -= src_stride * 3; - q0s16 = vld1q_s16(filter_y); - for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h - s = src; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0); - s += src_stride; - d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0); - s += src_stride; - d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0); - s += src_stride; - d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1); - s += src_stride; - d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0); - s += src_stride; - - q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32)); - q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32)); - q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32)); - q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32)); - - d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16)); - d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16)); - d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16)); - d = dst; - for (height = h; height > 0; height -= 4) { // loop_vert - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0); - s += src_stride; - d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1); - s += src_stride; - d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1); - s += src_stride; - - q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32)); - q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32)); - - d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16)); - d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16)); - d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16)); - d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); - d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16)); - d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16)); - d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16)); - d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16)); - - __builtin_prefetch(d); - __builtin_prefetch(d + dst_stride); - q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, - d22s16, d24s16, q0s16); - __builtin_prefetch(d + dst_stride * 2); - __builtin_prefetch(d + dst_stride * 3); - q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, - d24s16, d26s16, q0s16); - __builtin_prefetch(s); - __builtin_prefetch(s + src_stride); - q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16, - d26s16, d27s16, q0s16); - __builtin_prefetch(s + src_stride * 2); - __builtin_prefetch(s + src_stride * 3); - q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16, - d27s16, d25s16, q0s16); - - d2u16 = vqrshrun_n_s32(q1s32, 7); - d3u16 = vqrshrun_n_s32(q2s32, 7); - d4u16 = vqrshrun_n_s32(q14s32, 7); - d5u16 = vqrshrun_n_s32(q15s32, 7); - - q1u16 = vcombine_u16(d2u16, d3u16); - q2u16 = vcombine_u16(d4u16, d5u16); - - d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16)); - d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16)); - - vst1_lane_u32((uint32_t *)d, d2u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d2u32, 1); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 0); - d += dst_stride; - vst1_lane_u32((uint32_t *)d, d3u32, 1); - d += dst_stride; - - q8u16 = q10u16; - d18s16 = d22s16; - d19s16 = d24s16; - q10u16 = q13u16; - d22s16 = d25s16; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm deleted file mode 100644 index 38207d864..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_asm.asm +++ /dev/null @@ -1,273 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; AV1_FILTER_WEIGHT == 128 - ; AV1_FILTER_SHIFT == 7 - - EXPORT |aom_convolve8_horiz_neon| - EXPORT |aom_convolve8_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|aom_convolve8_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h - - vld1.s16 {q0}, [r5] ; filter_x - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -aom_convolve8_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -aom_convolve8_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; src[] * filter_x - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt aom_convolve8_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|aom_convolve8_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h - - vld1.s16 {q0}, [r4] ; filter_y - - lsl r1, r1, #1 - lsl r3, r3, #1 - -aom_convolve8_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -aom_convolve8_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - pld [r5] - pld [r8] - - ; src[] * filter_y - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r7] - pld [r4] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt aom_convolve8_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt aom_convolve8_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c deleted file mode 100644 index f05d3ceae..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" - -void aom_convolve_avg_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - uint8_t *d; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint32x2_t d0u32, d2u32; - uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - d = dst; - if (w > 32) { // avg64 - for (; h > 0; h -= 1) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - q10u8 = vld1q_u8(d + 32); - q11u8 = vld1q_u8(d + 48); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // avg32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - q8u8 = vld1q_u8(d); - q9u8 = vld1q_u8(d + 16); - d += dst_stride; - q10u8 = vld1q_u8(d); - q11u8 = vld1q_u8(d + 16); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q8u8); - q1u8 = vrhaddq_u8(q1u8, q9u8); - q2u8 = vrhaddq_u8(q2u8, q10u8); - q3u8 = vrhaddq_u8(q3u8, q11u8); - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // avg16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - q2u8 = vld1q_u8(d); - d += dst_stride; - q3u8 = vld1q_u8(d); - d += dst_stride; - - q0u8 = vrhaddq_u8(q0u8, q2u8); - q1u8 = vrhaddq_u8(q1u8, q3u8); - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // avg8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d1u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(d); - d += dst_stride; - d3u8 = vld1_u8(d); - d += dst_stride; - - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - q0u8 = vrhaddq_u8(q0u8, q1u8); - - vst1_u8(dst, vget_low_u8(q0u8)); - dst += dst_stride; - vst1_u8(dst, vget_high_u8(q0u8)); - dst += dst_stride; - } - } else { // avg4 - for (; h > 0; h -= 2) { - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0); - src += src_stride; - d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1); - src += src_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0); - d += dst_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1); - d += dst_stride; - - d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32), vreinterpret_u8_u32(d2u32)); - - d0u32 = vreinterpret_u32_u8(d0u8); - vst1_lane_u32((uint32_t *)dst, d0u32, 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, d0u32, 1); - dst += dst_stride; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm deleted file mode 100644 index 43c300954..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_avg_neon_asm.asm +++ /dev/null @@ -1,119 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_convolve_avg_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|aom_convolve_avg_neon| PROC - push {r4-r6, lr} - ldrd r4, r5, [sp, #32] - mov r6, r2 - - cmp r4, #32 - bgt avg64 - beq avg32 - cmp r4, #8 - bgt avg16 - beq avg8 - b avg4 - -avg64 - sub lr, r1, #32 - sub r4, r3, #32 -avg64_h - pld [r0, r1, lsl #1] - vld1.8 {q0-q1}, [r0]! - vld1.8 {q2-q3}, [r0], lr - pld [r2, r3] - vld1.8 {q8-q9}, [r6@128]! - vld1.8 {q10-q11}, [r6@128], r4 - vrhadd.u8 q0, q0, q8 - vrhadd.u8 q1, q1, q9 - vrhadd.u8 q2, q2, q10 - vrhadd.u8 q3, q3, q11 - vst1.8 {q0-q1}, [r2@128]! - vst1.8 {q2-q3}, [r2@128], r4 - subs r5, r5, #1 - bgt avg64_h - pop {r4-r6, pc} - -avg32 - vld1.8 {q0-q1}, [r0], r1 - vld1.8 {q2-q3}, [r0], r1 - vld1.8 {q8-q9}, [r6@128], r3 - vld1.8 {q10-q11}, [r6@128], r3 - pld [r0] - vrhadd.u8 q0, q0, q8 - pld [r0, r1] - vrhadd.u8 q1, q1, q9 - pld [r6] - vrhadd.u8 q2, q2, q10 - pld [r6, r3] - vrhadd.u8 q3, q3, q11 - vst1.8 {q0-q1}, [r2@128], r3 - vst1.8 {q2-q3}, [r2@128], r3 - subs r5, r5, #2 - bgt avg32 - pop {r4-r6, pc} - -avg16 - vld1.8 {q0}, [r0], r1 - vld1.8 {q1}, [r0], r1 - vld1.8 {q2}, [r6@128], r3 - vld1.8 {q3}, [r6@128], r3 - pld [r0] - pld [r0, r1] - vrhadd.u8 q0, q0, q2 - pld [r6] - pld [r6, r3] - vrhadd.u8 q1, q1, q3 - vst1.8 {q0}, [r2@128], r3 - vst1.8 {q1}, [r2@128], r3 - subs r5, r5, #2 - bgt avg16 - pop {r4-r6, pc} - -avg8 - vld1.8 {d0}, [r0], r1 - vld1.8 {d1}, [r0], r1 - vld1.8 {d2}, [r6@64], r3 - vld1.8 {d3}, [r6@64], r3 - pld [r0] - pld [r0, r1] - vrhadd.u8 q0, q0, q1 - pld [r6] - pld [r6, r3] - vst1.8 {d0}, [r2@64], r3 - vst1.8 {d1}, [r2@64], r3 - subs r5, r5, #2 - bgt avg8 - pop {r4-r6, pc} - -avg4 - vld1.32 {d0[0]}, [r0], r1 - vld1.32 {d0[1]}, [r0], r1 - vld1.32 {d2[0]}, [r6@32], r3 - vld1.32 {d2[1]}, [r6@32], r3 - vrhadd.u8 d0, d0, d2 - vst1.32 {d0[0]}, [r2@32], r3 - vst1.32 {d0[1]}, [r2@32], r3 - subs r5, r5, #2 - bgt avg4 - pop {r4-r6, pc} - ENDP - - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c deleted file mode 100644 index 9e57c7176..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" -#include "aom/aom_integer.h" - -void aom_convolve_copy_neon(const uint8_t *src, // r0 - ptrdiff_t src_stride, // r1 - uint8_t *dst, // r2 - ptrdiff_t dst_stride, // r3 - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - uint8x8_t d0u8, d2u8; - uint8x16_t q0u8, q1u8, q2u8, q3u8; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - if (w > 32) { // copy64 - for (; h > 0; h--) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - q2u8 = vld1q_u8(src + 32); - q3u8 = vld1q_u8(src + 48); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - vst1q_u8(dst + 32, q2u8); - vst1q_u8(dst + 48, q3u8); - dst += dst_stride; - } - } else if (w == 32) { // copy32 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - q1u8 = vld1q_u8(src + 16); - src += src_stride; - q2u8 = vld1q_u8(src); - q3u8 = vld1q_u8(src + 16); - src += src_stride; - - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - dst += dst_stride; - vst1q_u8(dst, q2u8); - vst1q_u8(dst + 16, q3u8); - dst += dst_stride; - } - } else if (w > 8) { // copy16 - for (; h > 0; h -= 2) { - q0u8 = vld1q_u8(src); - src += src_stride; - q1u8 = vld1q_u8(src); - src += src_stride; - - vst1q_u8(dst, q0u8); - dst += dst_stride; - vst1q_u8(dst, q1u8); - dst += dst_stride; - } - } else if (w == 8) { // copy8 - for (; h > 0; h -= 2) { - d0u8 = vld1_u8(src); - src += src_stride; - d2u8 = vld1_u8(src); - src += src_stride; - - vst1_u8(dst, d0u8); - dst += dst_stride; - vst1_u8(dst, d2u8); - dst += dst_stride; - } - } else { // copy4 - for (; h > 0; h--) { - *(uint32_t *)dst = *(const uint32_t *)src; - src += src_stride; - dst += dst_stride; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm deleted file mode 100644 index 443d7178a..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon_asm.asm +++ /dev/null @@ -1,87 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_convolve_copy_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|aom_convolve_copy_neon| PROC - push {r4-r5, lr} - ldrd r4, r5, [sp, #28] - - cmp r4, #32 - bgt copy64 - beq copy32 - cmp r4, #8 - bgt copy16 - beq copy8 - b copy4 - -copy64 - sub lr, r1, #32 - sub r3, r3, #32 -copy64_h - pld [r0, r1, lsl #1] - vld1.8 {q0-q1}, [r0]! - vld1.8 {q2-q3}, [r0], lr - vst1.8 {q0-q1}, [r2@128]! - vst1.8 {q2-q3}, [r2@128], r3 - subs r5, r5, #1 - bgt copy64_h - pop {r4-r5, pc} - -copy32 - pld [r0, r1, lsl #1] - vld1.8 {q0-q1}, [r0], r1 - pld [r0, r1, lsl #1] - vld1.8 {q2-q3}, [r0], r1 - vst1.8 {q0-q1}, [r2@128], r3 - vst1.8 {q2-q3}, [r2@128], r3 - subs r5, r5, #2 - bgt copy32 - pop {r4-r5, pc} - -copy16 - pld [r0, r1, lsl #1] - vld1.8 {q0}, [r0], r1 - pld [r0, r1, lsl #1] - vld1.8 {q1}, [r0], r1 - vst1.8 {q0}, [r2@128], r3 - vst1.8 {q1}, [r2@128], r3 - subs r5, r5, #2 - bgt copy16 - pop {r4-r5, pc} - -copy8 - pld [r0, r1, lsl #1] - vld1.8 {d0}, [r0], r1 - pld [r0, r1, lsl #1] - vld1.8 {d2}, [r0], r1 - vst1.8 {d0}, [r2@64], r3 - vst1.8 {d2}, [r2@64], r3 - subs r5, r5, #2 - bgt copy8 - pop {r4-r5, pc} - -copy4 - ldr r12, [r0], r1 - str r12, [r2], r3 - subs r5, r5, #1 - bgt copy4 - pop {r4-r5, pc} - ENDP - - END diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_neon.c deleted file mode 100644 index 6c2997e04..000000000 --- a/third_party/aom/aom_dsp/arm/aom_convolve_neon.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -void aom_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the - * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). - */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - - // Account for the vertical phase needing 3 lines prior and 4 lines post - int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* Filter starting 3 lines back. The neon implementation will ignore the - * given height and filter a multiple of 4 lines. Since this goes in to - * the temp buffer which has lots of extra room and is subsequently discarded - * this is safe if somewhat less than ideal. - */ - aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, - intermediate_height); - - /* Step into the temp buffer 3 lines to get the actual frame data */ - aom_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void aom_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - int intermediate_height = h + 7; - - assert(y_step_q4 == 16); - assert(x_step_q4 == 16); - - /* This implementation has the same issues as above. In addition, we only want - * to average the values after both passes. - */ - aom_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, - intermediate_height); - aom_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c deleted file mode 100644 index 6ff760017..000000000 --- a/third_party/aom/aom_dsp/arm/avg_neon.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" - -#include "aom/aom_integer.h" - -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -// coeff: 16 bits, dynamic range [-32640, 32640]. -// length: value range {16, 64, 256, 1024}. -int aom_satd_neon(const int16_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); - - do { - const int16x8_t src0 = vld1q_s16(coeff); - const int16x8_t src8 = vld1q_s16(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); - length -= 16; - coeff += 16; - } while (length != 0); - - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int satd = vget_lane_s32(s1, 0); - return satd; - } -} - -void aom_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, int ref_stride, - int height) { - int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); - - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; - } - - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); - - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); -} - -int16_t aom_int_pro_col_neon(uint8_t const *ref, const int width) { - int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; - } - - return horizontal_add_u16x8(vec_sum); -} - -// ref, src = [0, 510] - max diff = 16-bits -// bwl = {2, 3, 4}, width = {16, 32, 64} -int aom_vector_var_neon(int16_t const *ref, int16_t const *src, int bwl) { - int width = 4 << bwl; - int32x4_t sse = vdupq_n_s32(0); - int16x8_t total = vdupq_n_s16(0); - - assert(width >= 8); - assert((width % 8) == 0); - - do { - const int16x8_t r = vld1q_s16(ref); - const int16x8_t s = vld1q_s16(src); - const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. - sse = vmlal_s16(sse, diff_hi, diff_hi); - total = vaddq_s16(total, diff); // dynamic range 16 bits. - - ref += 8; - src += 8; - width -= 8; - } while (width != 0); - - { - // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired - // with the summation of 'sse' performed better on a Cortex-A15. - const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' - const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); - const int32x2_t t2 = vpadd_s32(t1, t1); - const int t = vget_lane_s32(t2, 0); - const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int s = vget_lane_s32(s1, 0); - const int shift_factor = bwl + 2; - return s - ((t * t) >> shift_factor); - } -} - -void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int *min, int *max) { - // Load and concatenate. - const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride)); - const uint8x16_t a23 = - vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride)); - const uint8x16_t a45 = - vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride)); - const uint8x16_t a67 = - vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride)); - - const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride)); - const uint8x16_t b23 = - vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride)); - const uint8x16_t b45 = - vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride)); - const uint8x16_t b67 = - vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride)); - - // Absolute difference. - const uint8x16_t ab01_diff = vabdq_u8(a01, b01); - const uint8x16_t ab23_diff = vabdq_u8(a23, b23); - const uint8x16_t ab45_diff = vabdq_u8(a45, b45); - const uint8x16_t ab67_diff = vabdq_u8(a67, b67); - - // Max values between the Q vectors. - const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff); - const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff); - const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff); - const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff); - - const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); - const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - - // Split to D and start doing pairwise. - uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); - uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - - // Enough runs of vpmax/min propogate the max/min values to every position. - ab_max = vpmax_u8(ab_max, ab_max); - ab_min = vpmin_u8(ab_min, ab_min); - - ab_max = vpmax_u8(ab_max, ab_max); - ab_min = vpmin_u8(ab_min, ab_min); - - ab_max = vpmax_u8(ab_max, ab_max); - ab_min = vpmin_u8(ab_min, ab_min); - - *min = *max = 0; // Clear high bits - // Store directly to avoid costly neon->gpr transfer. - vst1_lane_u8((uint8_t *)max, ab_max, 0); - vst1_lane_u8((uint8_t *)min, ab_min, 0); -} diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c new file mode 100644 index 000000000..82c0b0e28 --- /dev/null +++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" +#include "aom_ports/mem.h" +#include "av1/common/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1, + const int16x8_t v_maxval, int16x8_t *res) { + int32x4_t im_res_low, im_res_high; + const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask); + + im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0)); + im_res_low = + vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1)); + + im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0)); + im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask), + vget_high_s16(src_1)); + + *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS), + vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS)); +} + +static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride, + const CONV_BUF_TYPE *src0, uint32_t src0_stride, + const CONV_BUF_TYPE *src1, uint32_t src1_stride, + int16x8_t mask0, int16x8_t mask1, int16x8_t mask2, + int16x8_t mask3, const int16x8_t v_maxval, + const uint16x8_t vec_round_offset, + const int16x8_t vec_round_bits) { + int16x8_t src0_0, src0_1, src0_2, src0_3; + int16x8_t src1_0, src1_1, src1_2, src1_3; + int16x8_t im_res_0, im_res_1, im_res_2, im_res_3; + + load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2, + &src0_3); + load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2, + &src1_3); + + blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0); + blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1); + blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2); + blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3); + + uint16x8_t im_res1_0 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset); + uint16x8_t im_res1_1 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset); + uint16x8_t im_res1_2 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset); + uint16x8_t im_res1_3 = + vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset); + + im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits); + im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits); + im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits); + im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits); + + vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0)); + vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1)); + vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2)); + vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3)); +} + +static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride, + const CONV_BUF_TYPE *src0, uint32_t src0_stride, + const CONV_BUF_TYPE *src1, uint32_t src1_stride, + int16x4_t mask0, int16x4_t mask1, int16x4_t mask2, + int16x4_t mask3, const int16x8_t v_maxval, + const uint16x8_t vec_round_offset, + const int16x8_t vec_round_bits) { + int16x8_t src0_0, src0_1; + int16x8_t src1_0, src1_1; + uint64x2_t tu0, tu1, tu2, tu3; + int16x8_t mask0_1, mask2_3; + int16x8_t res0, res1; + + load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1); + load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3); + + src0_0 = vreinterpretq_s16_u64(tu0); + src0_1 = vreinterpretq_s16_u64(tu1); + + src1_0 = vreinterpretq_s16_u64(tu2); + src1_1 = vreinterpretq_s16_u64(tu3); + + mask0_1 = vcombine_s16(mask0, mask1); + mask2_3 = vcombine_s16(mask2, mask3); + + blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0); + blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1); + + uint16x8_t im_res_0 = + vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset); + uint16x8_t im_res_1 = + vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset); + + src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits); + src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits); + + uint8x8_t res_0 = vqmovun_s16(src0_0); + uint8x8_t res_1 = vqmovun_s16(src0_1); + + vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0), + 0); + vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0), + 1); + vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1), + 0); + vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1), + 1); +} + +void aom_lowbd_blend_a64_d16_mask_neon( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + int i = 0; + const int bd = 8; + int w_tmp = w; + const uint8_t *mask_tmp = mask; + const CONV_BUF_TYPE *src0_tmp = src0; + const CONV_BUF_TYPE *src1_tmp = src1; + uint8_t *dst_tmp = dst; + + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + uint8x8_t s0, s1, s2, s3; + uint32x2_t tu0, tu1, tu2, tu3; + uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; + int16x8_t mask0, mask1, mask2, mask3; + int16x8_t mask4, mask5, mask6, mask7; + int32x4_t m0_32, m1_32, m2_32, m3_32; + int32x4_t m4_32, m5_32, m6_32, m7_32; + uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l; + uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l; + int16x4_t mask0_low, mask1_low, mask2_low, mask3_low; + const uint16x4_t vec_zero = vdup_n_u16(0); + const uint16_t offset = round_offset - (1 << (round_bits - 1)); + const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA); + const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits); + const uint16x8_t vec_offset = vdupq_n_u16(offset); + + if (subw == 0 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3); + + mask0 = vmovl_s8(vreinterpret_s8_u8(s0)); + mask1 = vmovl_s8(vreinterpret_s8_u8(s1)); + mask2 = vmovl_s8(vreinterpret_s8_u8(s2)); + mask3 = vmovl_s8(vreinterpret_s8_u8(s3)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1); + + mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0))); + mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1))); + + mask0_low = vget_low_s16(mask0); + mask1_low = vget_high_s16(mask0); + mask2_low = vget_low_s16(mask1); + mask3_low = vget_high_s16(mask1); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else if (subw == 1 && subh == 1) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); + + mask0 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1))); + mask1 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3))); + mask2 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5))); + mask3 = + vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7))); + + mask4 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t0), vget_high_u8(t1))); + mask5 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t2), vget_high_u8(t3))); + mask6 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t4), vget_high_u8(t5))); + mask7 = vreinterpretq_s16_u16( + vaddl_u8(vget_high_u8(t6), vget_high_u8(t7))); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + m4_32 = vpaddlq_s16(mask4); + m5_32 = vpaddlq_s16(mask5); + m6_32 = vpaddlq_s16(mask6); + m7_32 = vpaddlq_s16(mask7); + + mask0 = + vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2)); + mask1 = + vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2)); + mask2 = + vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2)); + mask3 = + vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + m0_32 = vpaddlq_s16(mask0); + m1_32 = vpaddlq_s16(mask1); + m2_32 = vpaddlq_s16(mask2); + m3_32 = vpaddlq_s16(mask3); + + mask0_low = vqrshrn_n_s32(m0_32, 2); + mask1_low = vqrshrn_n_s32(m1_32, 2); + mask2_low = vqrshrn_n_s32(m2_32, 2); + mask3_low = vqrshrn_n_s32(m3_32, 2); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (8 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else if (subw == 1 && subh == 0) { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3); + + mask0 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0)))); + mask1 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1)))); + mask2 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2)))); + mask3 = vreinterpretq_s16_u16(vcombine_u16( + vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3)))); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + w_tmp -= 8; + mask_tmp += 16; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (4 * mask_stride) - (2 * w); + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l); + + mask0 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero)); + mask1 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero)); + mask2 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero)); + mask3 = + vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero)); + + mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1))); + mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1))); + mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1))); + mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1))); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (4 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } else { + if (w_tmp > 7) { + do { + w_tmp = w; + do { + load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l, + &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l)); + mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l)); + mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l)); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1)); + mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1)); + + blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0, mask1, mask2, mask3, v_maxval, + vec_offset, vec_round_bits); + + w_tmp -= 8; + mask_tmp += 8; + dst_tmp += 8; + src0_tmp += 8; + src1_tmp += 8; + } while (w_tmp > 7); + i += 4; + mask_tmp += (8 * mask_stride) - w; + dst_tmp += (4 * dst_stride) - w; + src0_tmp += (4 * src0_stride) - w; + src1_tmp += (4 * src1_stride) - w; + } while (i < h); + } else { + do { + load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1); + load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2, + &tu3); + + s0 = vreinterpret_u8_u32(tu0); + s1 = vreinterpret_u8_u32(tu1); + s2 = vreinterpret_u8_u32(tu2); + s3 = vreinterpret_u8_u32(tu3); + + mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2)); + mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3)); + + mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1)); + mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1)); + + mask0_low = vget_low_s16(mask0); + mask1_low = vget_high_s16(mask0); + mask2_low = vget_low_s16(mask1); + mask3_low = vget_high_s16(mask1); + + blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp, + src1_stride, mask0_low, mask1_low, mask2_low, mask3_low, + v_maxval, vec_offset, vec_round_bits); + + i += 4; + mask_tmp += (8 * mask_stride); + dst_tmp += (4 * dst_stride); + src0_tmp += (4 * src0_stride); + src1_tmp += (4 * src1_stride); + } while (i < h); + } + } +} diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c index 1cf8a3a6e..e4300c992 100644 --- a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c +++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c @@ -11,7 +11,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_dsp/txfm_common.h" void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c deleted file mode 100644 index 9baefae47..000000000 --- a/third_party/aom/aom_dsp/arm/hadamard_neon.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" - -static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, - int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { - const int16x8_t b0 = vaddq_s16(*a0, *a1); - const int16x8_t b1 = vsubq_s16(*a0, *a1); - const int16x8_t b2 = vaddq_s16(*a2, *a3); - const int16x8_t b3 = vsubq_s16(*a2, *a3); - const int16x8_t b4 = vaddq_s16(*a4, *a5); - const int16x8_t b5 = vsubq_s16(*a4, *a5); - const int16x8_t b6 = vaddq_s16(*a6, *a7); - const int16x8_t b7 = vsubq_s16(*a6, *a7); - - const int16x8_t c0 = vaddq_s16(b0, b2); - const int16x8_t c1 = vaddq_s16(b1, b3); - const int16x8_t c2 = vsubq_s16(b0, b2); - const int16x8_t c3 = vsubq_s16(b1, b3); - const int16x8_t c4 = vaddq_s16(b4, b6); - const int16x8_t c5 = vaddq_s16(b5, b7); - const int16x8_t c6 = vsubq_s16(b4, b6); - const int16x8_t c7 = vsubq_s16(b5, b7); - - *a0 = vaddq_s16(c0, c4); - *a1 = vsubq_s16(c2, c6); - *a2 = vsubq_s16(c0, c4); - *a3 = vaddq_s16(c2, c6); - *a4 = vaddq_s16(c3, c7); - *a5 = vsubq_s16(c3, c7); - *a6 = vsubq_s16(c1, c5); - *a7 = vaddq_s16(c1, c5); -} - -// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider -// reversing transpose order which may make it easier for the compiler to -// reconcile the vtrn.64 moves. -static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, - int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { - // Swap 64 bit elements. Goes from: - // a0: 00 01 02 03 04 05 06 07 - // a1: 08 09 10 11 12 13 14 15 - // a2: 16 17 18 19 20 21 22 23 - // a3: 24 25 26 27 28 29 30 31 - // a4: 32 33 34 35 36 37 38 39 - // a5: 40 41 42 43 44 45 46 47 - // a6: 48 49 50 51 52 53 54 55 - // a7: 56 57 58 59 60 61 62 63 - // to: - // a04_lo: 00 01 02 03 32 33 34 35 - // a15_lo: 08 09 10 11 40 41 42 43 - // a26_lo: 16 17 18 19 48 49 50 51 - // a37_lo: 24 25 26 27 56 57 58 59 - // a04_hi: 04 05 06 07 36 37 38 39 - // a15_hi: 12 13 14 15 44 45 46 47 - // a26_hi: 20 21 22 23 52 53 54 55 - // a37_hi: 28 29 30 31 60 61 62 63 - const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4)); - const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5)); - const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6)); - const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7)); - const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4)); - const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5)); - const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6)); - const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7)); - - // Swap 32 bit elements resulting in: - // a0246_lo: - // 00 01 16 17 32 33 48 49 - // 02 03 18 19 34 35 50 51 - // a1357_lo: - // 08 09 24 25 40 41 56 57 - // 10 11 26 27 42 43 58 59 - // a0246_hi: - // 04 05 20 21 36 37 52 53 - // 06 07 22 23 38 39 54 55 - // a1657_hi: - // 12 13 28 29 44 45 60 61 - // 14 15 30 31 46 47 62 63 - const int32x4x2_t a0246_lo = - vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); - const int32x4x2_t a1357_lo = - vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); - const int32x4x2_t a0246_hi = - vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); - const int32x4x2_t a1357_hi = - vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); - - // Swap 16 bit elements resulting in: - // b0: - // 00 08 16 24 32 40 48 56 - // 01 09 17 25 33 41 49 57 - // b1: - // 02 10 18 26 34 42 50 58 - // 03 11 19 27 35 43 51 59 - // b2: - // 04 12 20 28 36 44 52 60 - // 05 13 21 29 37 45 53 61 - // b3: - // 06 14 22 30 38 46 54 62 - // 07 15 23 31 39 47 55 63 - const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]), - vreinterpretq_s16_s32(a1357_lo.val[0])); - const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]), - vreinterpretq_s16_s32(a1357_lo.val[1])); - const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]), - vreinterpretq_s16_s32(a1357_hi.val[0])); - const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]), - vreinterpretq_s16_s32(a1357_hi.val[1])); - - *a0 = b0.val[0]; - *a1 = b0.val[1]; - *a2 = b1.val[0]; - *a3 = b1.val[1]; - *a4 = b2.val[0]; - *a5 = b2.val[1]; - *a6 = b3.val[0]; - *a7 = b3.val[1]; -} - -void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int16x8_t a0 = vld1q_s16(src_diff); - int16x8_t a1 = vld1q_s16(src_diff + src_stride); - int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); - int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride); - int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride); - int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride); - int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride); - int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride); - - hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - - transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - - hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - - // Skip the second transpose because it is not required. - - vst1q_s16(coeff + 0, a0); - vst1q_s16(coeff + 8, a1); - vst1q_s16(coeff + 16, a2); - vst1q_s16(coeff + 24, a3); - vst1q_s16(coeff + 32, a4); - vst1q_s16(coeff + 40, a5); - vst1q_s16(coeff + 48, a6); - vst1q_s16(coeff + 56, a7); -} - -void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int i; - - /* Rearrange 16x16 to 8x32 and remove stride. - * Top left first. */ - aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0); - /* Top right. */ - aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64); - /* Bottom left. */ - aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128); - /* Bottom right. */ - aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); - - for (i = 0; i < 64; i += 8) { - const int16x8_t a0 = vld1q_s16(coeff + 0); - const int16x8_t a1 = vld1q_s16(coeff + 64); - const int16x8_t a2 = vld1q_s16(coeff + 128); - const int16x8_t a3 = vld1q_s16(coeff + 192); - - const int16x8_t b0 = vhaddq_s16(a0, a1); - const int16x8_t b1 = vhsubq_s16(a0, a1); - const int16x8_t b2 = vhaddq_s16(a2, a3); - const int16x8_t b3 = vhsubq_s16(a2, a3); - - const int16x8_t c0 = vaddq_s16(b0, b2); - const int16x8_t c1 = vaddq_s16(b1, b3); - const int16x8_t c2 = vsubq_s16(b0, b2); - const int16x8_t c3 = vsubq_s16(b1, b3); - - vst1q_s16(coeff + 0, c0); - vst1q_s16(coeff + 64, c1); - vst1q_s16(coeff + 128, c2); - vst1q_s16(coeff + 192, c3); - - coeff += 8; - } -} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c deleted file mode 100644 index 196b2a890..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm deleted file mode 100644 index d01c4bc03..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_1_add_neon_asm.asm +++ /dev/null @@ -1,201 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |aom_idct16x16_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c deleted file mode 100644 index b4cb7a0cd..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon.c +++ /dev/null @@ -1,1295 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -void aom_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - d30s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16((int16_t)cospi_24_64); - d31s16 = vdup_n_s16((int16_t)cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void aom_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - // stage 3 - d12s16 = vdup_n_s16((int16_t)cospi_30_64); - d13s16 = vdup_n_s16((int16_t)cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16((int16_t)cospi_14_64); - d31s16 = vdup_n_s16((int16_t)cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16((int16_t)cospi_22_64); - d31s16 = vdup_n_s16((int16_t)cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16((int16_t)cospi_6_64); - d31s16 = vdup_n_s16((int16_t)cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); - - // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = - vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = - vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); - } - return; -} - -void aom_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // stage 3 - q0s16 = vdupq_n_s16((int16_t)(cospi_28_64 * 2)); - q1s16 = vdupq_n_s16((int16_t)(cospi_4_64 * 2)); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - // stage 4 - q1s16 = vdupq_n_s16((int16_t)(cospi_16_64 * 2)); - d4s16 = vdup_n_s16((int16_t)cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 6 - q2s16 = vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); - return; -} - -void aom_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // stage 3 - q6s16 = vdupq_n_s16((int16_t)(cospi_30_64 * 2)); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16((int16_t)(cospi_2_64 * 2)); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16((int16_t)(cospi_6_64 * 2)); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); - - // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); - - // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); - - // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm deleted file mode 100644 index 4a8f8f183..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_add_neon_asm.asm +++ /dev/null @@ -1,1182 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct16x16_256_add_neon_pass1| - EXPORT |aom_idct16x16_256_add_neon_pass2| - EXPORT |aom_idct16x16_10_add_neon_pass1| - EXPORT |aom_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |aom_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64 = 3196 - mov r3, #0xc00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r12, #0x3e00 - add r12, #0xc5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; generate cospi_12_64 = 13623 - mov r3, #0x3500 - add r3, #0x37 - - ; generate cospi_20_64 = 9102 - mov r12, #0x2300 - add r12, #0x8e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; generate cospi_24_64 = 6270 - mov r12, #0x1800 - add r12, #0x7e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; generate cospi_8_64 = 15137 - mov r3, #0x3b00 - add r3, #0x21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, #14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |aom_idct16x16_256_add_neon_pass1| - -;void aom_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate cospi_30_64 = 1606 - mov r3, #0x0600 - add r3, #0x46 - - ; generate cospi_2_64 = 16305 - mov r12, #0x3f00 - add r12, #0xb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; generate cospi_14_64 = 12665 - mov r3, #0x3100 - add r3, #0x79 - - ; generate cospi_18_64 = 10394 - mov r12, #0x2800 - add r12, #0x9a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b - - ; generate cospi_10_64 = 14449 - mov r12, #0x3800 - add r12, #0x71 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; generate cospi_6_64 = 15679 - mov r3, #0x3d00 - add r3, #0x3f - - ; generate cospi_26_64 = 4756 - mov r12, #0x1200 - add r12, #0x94 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |aom_idct16x16_256_add_neon_pass2| - -;void |aom_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) -; -; r0 int16_t input -; r1 int16_t *output -; r2 int output_stride) - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - - ; generate cospi_28_64*2 = 6392 - mov r3, #0x1800 - add r3, #0xf8 - - ; generate cospi_4_64*2 = 32138 - mov r12, #0x7d00 - add r12, #0x8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. - ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; generate cospi_16_64*2 = 23170 - mov r3, #0x5a00 - add r3, #0x82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 - - bx lr - ENDP ; |aom_idct16x16_10_add_neon_pass1| - -;void aom_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) -; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|aom_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! - vmov.s16 q15, q0; - - ; generate 2*cospi_30_64 = 3212 - mov r3, #0xc00 - add r3, #0x8c - - ; generate 2*cospi_2_64 = 32610 - mov r12, #0x7f00 - add r12, #0x62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; generate 2*cospi_26_64 = 9512 - mov r12, #0x2500 - add r12, #0x28 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; generate 2*cospi_6_64 = 31358 - mov r3, #0x7a00 - add r3, #0x7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1Output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |aom_idct16x16_10_add_neon_pass2| - END diff --git a/third_party/aom/aom_dsp/arm/idct16x16_neon.c b/third_party/aom/aom_dsp/arm/idct16x16_neon.c deleted file mode 100644 index db0d4905b..000000000 --- a/third_party/aom/aom_dsp/arm/idct16x16_neon.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/aom_dsp_common.h" - -void aom_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void aom_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); -void aom_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void aom_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void aom_push_neon(int64_t *store); -extern void aom_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void aom_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - aom_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - aom_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - aom_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, - pass1_output, 0, dest, dest_stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - aom_pop_neon(store_reg); -#endif - - return; -} - -void aom_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - aom_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_10_add_neon_pass1(input, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - aom_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - aom_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - aom_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - aom_pop_neon(store_reg); -#endif - - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c deleted file mode 100644 index 547567c5b..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_config.h" - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); - return; -} - -static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); - return; -} - -static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); - return; -} - -void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } - } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm deleted file mode 100644 index b04df2d0b..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_1_add_neon_asm.asm +++ /dev/null @@ -1,147 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - EXPORT |aom_idct32x32_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ;TODO(hkuang): put the following macros in a seperate - ;file so other idct function could also use them. - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10},[$dst], $stride - vst1.8 {q11},[$dst], $stride - vst1.8 {q12},[$dst], $stride - vst1.8 {q13},[$dst], $stride - vst1.8 {q14},[$dst], $stride - vst1.8 {q15},[$dst], $stride - MEND - -;void aom_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride - -|aom_idct32x32_1_add_neon| PROC - push {lr} - pld [r1] - add r3, r1, #16 ; r3 dest + 16 for second loop - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asrs r0, r0, #6 ; >> 6 - bge diff_positive_32_32 - -diff_negative_32_32 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_negative_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_negative_32_32_loop - pop {pc} - -diff_positive_32_32 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_positive_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_positive_32_32_loop - pop {pc} - - ENDP ; |aom_idct32x32_1_add_neon| - END diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c b/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c deleted file mode 100644 index a7562c7d5..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon.c +++ /dev/null @@ -1,686 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" - -#define LOAD_FROM_TRANSPOSED(prev, first, second) \ - q14s16 = vld1q_s16(trans_buf + first * 8); \ - q13s16 = vld1q_s16(trans_buf + second * 8); - -#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \ - qA = vld1q_s16(out + first * 32); \ - qB = vld1q_s16(out + second * 32); - -#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \ - vst1q_s16(out + first * 32, qA); \ - vst1q_s16(out + second * 32, qB); - -#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \ - __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16); -static INLINE void __STORE_COMBINE_CENTER_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q6s16, - int16x8_t q7s16, - int16x8_t q8s16, - int16x8_t q9s16) { - int16x4_t d8s16, d9s16, d10s16, d11s16; - - d8s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d11s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d9s16 = vld1_s16((int16_t *)p1); - d10s16 = vld1_s16((int16_t *)p2); - - q7s16 = vrshrq_n_s16(q7s16, 6); - q8s16 = vrshrq_n_s16(q8s16, 6); - q9s16 = vrshrq_n_s16(q9s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d9s16))); - q8s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s16(d10s16))); - q9s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s16(d11s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d8s16))); - - d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16)); - d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16)); - d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - - vst1_s16((int16_t *)p1, d9s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d10s16); - p2 += stride; - vst1_s16((int16_t *)p1, d8s16); - vst1_s16((int16_t *)p2, d11s16); - return; -} - -#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \ - ; \ - __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16); -static INLINE void __STORE_COMBINE_EXTREME_RESULTS(uint8_t *p1, uint8_t *p2, - int stride, int16x8_t q4s16, - int16x8_t q5s16, - int16x8_t q6s16, - int16x8_t q7s16) { - int16x4_t d4s16, d5s16, d6s16, d7s16; - - d4s16 = vld1_s16((int16_t *)p1); - p1 += stride; - d7s16 = vld1_s16((int16_t *)p2); - p2 -= stride; - d5s16 = vld1_s16((int16_t *)p1); - d6s16 = vld1_s16((int16_t *)p2); - - q5s16 = vrshrq_n_s16(q5s16, 6); - q6s16 = vrshrq_n_s16(q6s16, 6); - q7s16 = vrshrq_n_s16(q7s16, 6); - q4s16 = vrshrq_n_s16(q4s16, 6); - - q5s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s16(d5s16))); - q6s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q6s16), vreinterpret_u8_s16(d6s16))); - q7s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q7s16), vreinterpret_u8_s16(d7s16))); - q4s16 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s16(d4s16))); - - d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16)); - d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16)); - d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16)); - d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16)); - - vst1_s16((int16_t *)p1, d5s16); - p1 -= stride; - vst1_s16((int16_t *)p2, d6s16); - p2 += stride; - vst1_s16((int16_t *)p2, d7s16); - vst1_s16((int16_t *)p1, d4s16); - return; -} - -#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \ - DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB); -static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, - int16_t first_const, int16_t second_const, - int16x8_t *qAs16, int16x8_t *qBs16) { - int16x4_t d30s16, d31s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32; - int16x4_t dCs16, dDs16, dAs16, dBs16; - - dCs16 = vget_low_s16(q14s16); - dDs16 = vget_high_s16(q14s16); - dAs16 = vget_low_s16(q13s16); - dBs16 = vget_high_s16(q13s16); - - d30s16 = vdup_n_s16(first_const); - d31s16 = vdup_n_s16(second_const); - - q8s32 = vmull_s16(dCs16, d30s16); - q10s32 = vmull_s16(dAs16, d31s16); - q9s32 = vmull_s16(dDs16, d30s16); - q11s32 = vmull_s16(dBs16, d31s16); - q12s32 = vmull_s16(dCs16, d31s16); - - q8s32 = vsubq_s32(q8s32, q10s32); - q9s32 = vsubq_s32(q9s32, q11s32); - - q10s32 = vmull_s16(dDs16, d31s16); - q11s32 = vmull_s16(dAs16, d30s16); - q15s32 = vmull_s16(dBs16, d30s16); - - q11s32 = vaddq_s32(q12s32, q11s32); - q10s32 = vaddq_s32(q10s32, q15s32); - - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), vqrshrn_n_s32(q10s32, 14)); - return; -} - -static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { - int16_t *in; - int i; - const int stride = 32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - q12s16 = vcombine_s16(d17s16, d25s16); - q13s16 = vcombine_s16(d19s16, d27s16); - q14s16 = vcombine_s16(d21s16, d29s16); - q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - vst1q_s16(t_buf, q0x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q0x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[1]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[0]); - t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[1]); - t_buf += 8; - } - return; -} - -static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, - int16x8_t q3s16, int16x8_t q6s16, - int16x8_t q7s16, int16x8_t q8s16, - int16x8_t q9s16, int16x8_t q10s16, - int16x8_t q11s16, int16x8_t q12s16, - int16x8_t q13s16, int16x8_t q14s16, - int16x8_t q15s16) { - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16); - STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16); - - LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16); - STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16); - - LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16); - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16); - STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16); - - LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16); - STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16); - - LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16); - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16); - STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16); - - LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16); - STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16); - - LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16); - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16); - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16); - STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16); - - LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16); - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16); - STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16); - return; -} - -static INLINE void idct32_bands_end_2nd_pass( - int16_t *out, uint8_t *dest, int stride, int16x8_t q2s16, int16x8_t q3s16, - int16x8_t q6s16, int16x8_t q7s16, int16x8_t q8s16, int16x8_t q9s16, - int16x8_t q10s16, int16x8_t q11s16, int16x8_t q12s16, int16x8_t q13s16, - int16x8_t q14s16, int16x8_t q15s16) { - uint8_t *r6 = dest + 31 * stride; - uint8_t *r7 = dest /* + 0 * stride*/; - uint8_t *r9 = dest + 15 * stride; - uint8_t *r10 = dest + 16 * stride; - int str2 = stride << 1; - int16x8_t q0s16, q1s16, q4s16, q5s16; - - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; - - LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; - - LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16) - q2s16 = vaddq_s16(q10s16, q1s16); - q3s16 = vaddq_s16(q11s16, q0s16); - q4s16 = vsubq_s16(q11s16, q0s16); - q5s16 = vsubq_s16(q10s16, q1s16); - - LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; - - LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; - - LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16) - q2s16 = vaddq_s16(q12s16, q1s16); - q3s16 = vaddq_s16(q13s16, q0s16); - q4s16 = vsubq_s16(q13s16, q0s16); - q5s16 = vsubq_s16(q12s16, q1s16); - - LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - r10 += str2; - r9 -= str2; - - LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - r7 += str2; - r6 -= str2; - - LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16) - q2s16 = vaddq_s16(q14s16, q1s16); - q3s16 = vaddq_s16(q15s16, q0s16); - q4s16 = vsubq_s16(q15s16, q0s16); - q5s16 = vsubq_s16(q14s16, q1s16); - - LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - STORE_COMBINE_CENTER_RESULTS(r10, r9); - - LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16) - q4s16 = vaddq_s16(q2s16, q1s16); - q5s16 = vaddq_s16(q3s16, q0s16); - q6s16 = vsubq_s16(q3s16, q0s16); - q7s16 = vsubq_s16(q2s16, q1s16); - STORE_COMBINE_EXTREME_RESULTS(r7, r6); - return; -} - -void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int stride) { - int i, idct32_pass_loop; - int16_t trans_buf[32 * 8]; - int16_t pass1[32 * 32]; - int16_t pass2[32 * 32]; - int16_t *out; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - - for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); - - // ----------------------------------------- - // BLOCK A: 16-19,28-31 - // ----------------------------------------- - // generate 16,17,30,31 - // part of stage 1 - LOAD_FROM_TRANSPOSED(0, 1, 31) - DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(31, 17, 15) - DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16) - // part of stage 2 - q4s16 = vaddq_s16(q0s16, q1s16); - q13s16 = vsubq_s16(q0s16, q1s16); - q6s16 = vaddq_s16(q2s16, q3s16); - q14s16 = vsubq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16) - - // generate 18,19,28,29 - // part of stage 1 - LOAD_FROM_TRANSPOSED(15, 9, 23) - DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(23, 25, 7) - DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q3s16, q2s16); - q3s16 = vaddq_s16(q3s16, q2s16); - q14s16 = vsubq_s16(q1s16, q0s16); - q2s16 = vaddq_s16(q1s16, q0s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16) - // part of stage 4 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q10s16 = vaddq_s16(q7s16, q1s16); - q15s16 = vaddq_s16(q6s16, q3s16); - q13s16 = vsubq_s16(q5s16, q0s16); - q14s16 = vsubq_s16(q7s16, q1s16); - STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16) - STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16) - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16) - STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16) - // part of stage 4 - q13s16 = vsubq_s16(q4s16, q2s16); - q14s16 = vsubq_s16(q6s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16) - STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16) - - // ----------------------------------------- - // BLOCK B: 20-23,24-27 - // ----------------------------------------- - // generate 20,21,26,27 - // part of stage 1 - LOAD_FROM_TRANSPOSED(7, 5, 27) - DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(27, 21, 11) - DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16) - // part of stage 2 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 3 - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - - // generate 22,23,24,25 - // part of stage 1 - LOAD_FROM_TRANSPOSED(11, 13, 19) - DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(19, 29, 3) - DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16) - // part of stage 2 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 3 - DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16) - // part of stage 4 - q10s16 = vaddq_s16(q7s16, q1s16); - q11s16 = vaddq_s16(q5s16, q0s16); - q12s16 = vaddq_s16(q6s16, q2s16); - q15s16 = vaddq_s16(q4s16, q3s16); - // part of stage 6 - LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q11s16); - q9s16 = vaddq_s16(q13s16, q10s16); - q13s16 = vsubq_s16(q13s16, q10s16); - q11s16 = vsubq_s16(q14s16, q11s16); - STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16) - LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16) - q8s16 = vsubq_s16(q9s16, q12s16); - q10s16 = vaddq_s16(q14s16, q15s16); - q14s16 = vsubq_s16(q14s16, q15s16); - q12s16 = vaddq_s16(q9s16, q12s16); - STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16) - q13s16 = q11s16; - q14s16 = q8s16; - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16) - // part of stage 4 - q14s16 = vsubq_s16(q5s16, q0s16); - q13s16 = vsubq_s16(q6s16, q2s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16); - q14s16 = vsubq_s16(q7s16, q1s16); - q13s16 = vsubq_s16(q4s16, q3s16); - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16); - // part of stage 6 - LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16) - q8s16 = vaddq_s16(q14s16, q1s16); - q9s16 = vaddq_s16(q13s16, q6s16); - q13s16 = vsubq_s16(q13s16, q6s16); - q1s16 = vsubq_s16(q14s16, q1s16); - STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16) - LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16) - q14s16 = vsubq_s16(q8s16, q5s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q9s16, q0s16); - q0s16 = vsubq_s16(q9s16, q0s16); - STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16) - // part of stage 7 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16) - STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16) - DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64, &q1s16, &q0s16); - STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16) - - // ----------------------------------------- - // BLOCK C: 8-10,11-15 - // ----------------------------------------- - // generate 8,9,14,15 - // part of stage 2 - LOAD_FROM_TRANSPOSED(3, 2, 30) - DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(30, 18, 14) - DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16) - // part of stage 3 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 4 - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16) - - // generate 10,11,12,13 - // part of stage 2 - LOAD_FROM_TRANSPOSED(14, 10, 22) - DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(22, 26, 6) - DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16) - // part of stage 3 - q14s16 = vsubq_s16(q4s16, q5s16); - q5s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q6s16, q7s16); - q6s16 = vaddq_s16(q6s16, q7s16); - // part of stage 4 - DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16) - // part of stage 5 - q8s16 = vaddq_s16(q0s16, q5s16); - q9s16 = vaddq_s16(q1s16, q7s16); - q13s16 = vsubq_s16(q1s16, q7s16); - q14s16 = vsubq_s16(q3s16, q4s16); - q10s16 = vaddq_s16(q3s16, q4s16); - q15s16 = vaddq_s16(q2s16, q6s16); - STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16) - STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16) - // part of stage 6 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16) - q13s16 = vsubq_s16(q0s16, q5s16); - q14s16 = vsubq_s16(q2s16, q6s16); - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16) - - // ----------------------------------------- - // BLOCK D: 0-3,4-7 - // ----------------------------------------- - // generate 4,5,6,7 - // part of stage 3 - LOAD_FROM_TRANSPOSED(6, 4, 28) - DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16) - LOAD_FROM_TRANSPOSED(28, 20, 12) - DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16) - // part of stage 4 - q13s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q14s16 = vsubq_s16(q2s16, q3s16); - q2s16 = vaddq_s16(q2s16, q3s16); - // part of stage 5 - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16) - - // generate 0,1,2,3 - // part of stage 4 - LOAD_FROM_TRANSPOSED(12, 0, 16) - DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16) - LOAD_FROM_TRANSPOSED(16, 8, 24) - DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16) - // part of stage 5 - q4s16 = vaddq_s16(q7s16, q6s16); - q7s16 = vsubq_s16(q7s16, q6s16); - q6s16 = vsubq_s16(q5s16, q14s16); - q5s16 = vaddq_s16(q5s16, q14s16); - // part of stage 6 - q8s16 = vaddq_s16(q4s16, q2s16); - q9s16 = vaddq_s16(q5s16, q3s16); - q10s16 = vaddq_s16(q6s16, q1s16); - q11s16 = vaddq_s16(q7s16, q0s16); - q12s16 = vsubq_s16(q7s16, q0s16); - q13s16 = vsubq_s16(q6s16, q1s16); - q14s16 = vsubq_s16(q5s16, q3s16); - q15s16 = vsubq_s16(q4s16, q2s16); - // part of stage 7 - LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16) - q2s16 = vaddq_s16(q8s16, q1s16); - q3s16 = vaddq_s16(q9s16, q0s16); - q4s16 = vsubq_s16(q9s16, q0s16); - q5s16 = vsubq_s16(q8s16, q1s16); - LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16) - q8s16 = vaddq_s16(q4s16, q1s16); - q9s16 = vaddq_s16(q5s16, q0s16); - q6s16 = vsubq_s16(q5s16, q0s16); - q7s16 = vsubq_s16(q4s16, q1s16); - - if (idct32_pass_loop == 0) { - idct32_bands_end_1st_pass(out, q2s16, q3s16, q6s16, q7s16, q8s16, q9s16, - q10s16, q11s16, q12s16, q13s16, q14s16, - q15s16); - } else { - idct32_bands_end_2nd_pass(out, dest, stride, q2s16, q3s16, q6s16, q7s16, - q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, - q14s16, q15s16); - dest += 8; - } - } - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm deleted file mode 100644 index e7793fb16..000000000 --- a/third_party/aom/aom_dsp/arm/idct32x32_add_neon_asm.asm +++ /dev/null @@ -1,1302 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -;TODO(cd): adjust these constant to be able to use vqdmulh for faster -; dct_const_round_shift(a * b) within butterfly calculations. -cospi_1_64 EQU 16364 -cospi_2_64 EQU 16305 -cospi_3_64 EQU 16207 -cospi_4_64 EQU 16069 -cospi_5_64 EQU 15893 -cospi_6_64 EQU 15679 -cospi_7_64 EQU 15426 -cospi_8_64 EQU 15137 -cospi_9_64 EQU 14811 -cospi_10_64 EQU 14449 -cospi_11_64 EQU 14053 -cospi_12_64 EQU 13623 -cospi_13_64 EQU 13160 -cospi_14_64 EQU 12665 -cospi_15_64 EQU 12140 -cospi_16_64 EQU 11585 -cospi_17_64 EQU 11003 -cospi_18_64 EQU 10394 -cospi_19_64 EQU 9760 -cospi_20_64 EQU 9102 -cospi_21_64 EQU 8423 -cospi_22_64 EQU 7723 -cospi_23_64 EQU 7005 -cospi_24_64 EQU 6270 -cospi_25_64 EQU 5520 -cospi_26_64 EQU 4756 -cospi_27_64 EQU 3981 -cospi_28_64 EQU 3196 -cospi_29_64 EQU 2404 -cospi_30_64 EQU 1606 -cospi_31_64 EQU 804 - - - EXPORT |aom_idct32x32_1024_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY - - ; -------------------------------------------------------------------------- - ; Load from transposed_buffer - ; q13 = transposed_buffer[first_offset] - ; q14 = transposed_buffer[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; transposed_buffer must be passed in. use 0 for first use. - MACRO - LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset - ; address calculation with proper stride and loading - add r0, #($first_offset - $prev_offset )*8*2 - vld1.s16 {q14}, [r0] - add r0, #($second_offset - $first_offset)*8*2 - vld1.s16 {q13}, [r0] - ; (used) two registers (q14, q13) - MEND - ; -------------------------------------------------------------------------- - ; Load from output (used as temporary storage) - ; reg1 = output[first_offset] - ; reg2 = output[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and loading - add r1, #($first_offset - $prev_offset )*32*2 - vld1.s16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vld1.s16 {$reg2}, [r1] - ; (used) two registers ($reg1, $reg2) - MEND - ; -------------------------------------------------------------------------- - ; Store into output (sometimes as as temporary storage) - ; output[first_offset] = reg1 - ; output[second_offset] = reg2 - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and storing - add r1, #($first_offset - $prev_offset )*32*2 - vst1.16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vst1.16 {$reg2}, [r1] - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10] - vst1.16 {d11}, [r9] - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10]! - vst1.16 {d11}, [r9]! - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6] - vst1.16 {d4}, [r7] - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6]! - vst1.16 {d4}, [r7]! - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - ; TODO(cd): have special case to re-use constants when they are similar for - ; consecutive butterflies - ; TODO(cd): have special case when both constants are the same, do the - ; additions/subtractions before the multiplies. - ; generate the constants - ; generate scalar constants - mov r8, #$first_constant & 0xFF00 - mov r12, #$second_constant & 0xFF00 - add r8, #$first_constant & 0x00FF - add r12, #$second_constant & 0x00FF - ; generate vector constants - vdup.16 d30, r8 - vdup.16 d31, r12 - ; (used) two for inputs (regA-regD), one for constants (q15) - ; do some multiplications (ordered for maximum latency hiding) - vmull.s16 q8, $regC, d30 - vmull.s16 q10, $regA, d31 - vmull.s16 q9, $regD, d30 - vmull.s16 q11, $regB, d31 - vmull.s16 q12, $regC, d31 - ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/subtractions (to get back two register) - vsub.s32 q8, q8, q10 - vsub.s32 q9, q9, q11 - ; do more multiplications (ordered for maximum latency hiding) - vmull.s16 q10, $regD, d31 - vmull.s16 q11, $regA, d30 - vmull.s16 q15, $regB, d30 - ; (used) six for intermediate (q8-q12, q15) - ; do more addition/subtractions - vadd.s32 q11, q12, q11 - vadd.s32 q10, q10, q15 - ; (used) four for intermediate (q8-q11) - ; dct_const_round_shift - vqrshrn.s32 $reg1, q8, #14 - vqrshrn.s32 $reg2, q9, #14 - vqrshrn.s32 $reg3, q11, #14 - vqrshrn.s32 $reg4, q10, #14 - ; (used) two for results, well four d registers - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - MEND - ; -------------------------------------------------------------------------- - -;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); -; -; r0 int16_t *input, -; r1 uint8_t *dest, -; r2 int dest_stride) -; loop counters -; r4 bands loop counter -; r5 pass loop counter -; r8 transpose loop counter -; combine-add pointers -; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) -; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) -; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) -; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) - -|aom_idct32x32_1024_add_neon| PROC - ; This function does one pass of idct32x32 transform. - ; - ; This is done by transposing the input and then doing a 1d transform on - ; columns. In the first pass, the transposed columns are the original - ; rows. In the second pass, after the transposition, the colums are the - ; original columns. - ; The 1d transform is done by looping over bands of eight columns (the - ; idct32_bands loop). For each band, the transform input transposition - ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are transposed by pairs (the idct32_transpose_pair loop). - push {r4-r11} - vpush {d8-d15} - ; stack operation - ; internal buffer used to transpose 8 lines into before transforming them - ; int16_t transpose_buffer[32 * 8]; - ; at sp + [4096, 4607] - ; results of the first pass (transpose and transform rows) - ; int16_t pass1[32 * 32]; - ; at sp + [0, 2047] - ; results of the second pass (transpose and transform columns) - ; int16_t pass2[32 * 32]; - ; at sp + [2048, 4095] - sub sp, sp, #512+2048+2048 - - ; r6 = dest + 31 * dest_stride - ; r7 = dest + 0 * dest_stride - ; r9 = dest + 15 * dest_stride - ; r10 = dest + 16 * dest_stride - rsb r6, r2, r2, lsl #5 - rsb r9, r2, r2, lsl #4 - add r10, r1, r2, lsl #4 - mov r7, r1 - add r6, r6, r1 - add r9, r9, r1 - ; r11 = -dest_stride - neg r11, r2 - ; r3 = input - mov r3, r0 - ; parameters for first pass - ; r0 = transpose_buffer[32 * 8] - add r0, sp, #4096 - ; r1 = pass1[32 * 32] - mov r1, sp - - mov r5, #0 ; initialize pass loop counter -idct32_pass_loop - mov r4, #4 ; initialize bands loop counter -idct32_bands_loop - mov r8, #2 ; initialize transpose loop counter -idct32_transpose_pair_loop - ; Load two horizontally consecutive 8x8 16bit data matrices. The first one - ; into q0-q7 and the second one into q8-q15. There is a stride of 64, - ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r3]! - vld1.s16 {q0}, [r3]! - add r3, #32 - vld1.s16 {q9}, [r3]! - vld1.s16 {q1}, [r3]! - add r3, #32 - vld1.s16 {q10}, [r3]! - vld1.s16 {q2}, [r3]! - add r3, #32 - vld1.s16 {q11}, [r3]! - vld1.s16 {q3}, [r3]! - add r3, #32 - vld1.s16 {q12}, [r3]! - vld1.s16 {q4}, [r3]! - add r3, #32 - vld1.s16 {q13}, [r3]! - vld1.s16 {q5}, [r3]! - add r3, #32 - vld1.s16 {q14}, [r3]! - vld1.s16 {q6}, [r3]! - add r3, #32 - vld1.s16 {q15}, [r3]! - vld1.s16 {q7}, [r3]! - - ; Transpose the two 8x8 16bit data matrices. - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vswp d1, d8 - vswp d7, d14 - vswp d5, d12 - vswp d3, d10 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; Store both matrices after each other. There is a stride of 32, which - ; adjusts to nothing because of the post-increments. - vst1.16 {q8}, [r0]! - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! - vst1.16 {q0}, [r0]! - vst1.16 {q1}, [r0]! - vst1.16 {q2}, [r0]! - vst1.16 {q3}, [r0]! - vst1.16 {q4}, [r0]! - vst1.16 {q5}, [r0]! - vst1.16 {q6}, [r0]! - vst1.16 {q7}, [r0]! - - ; increment pointers by adjusted stride (not necessary for r0/out) - ; go back by 7*32 for the seven lines moved fully by read and add - ; go back by 32 for the eigth line only read - ; advance by 16*2 to go the next pair - sub r3, r3, #7*32*2 + 32 - 16*2 - ; transpose pair loop processing - subs r8, r8, #1 - bne idct32_transpose_pair_loop - - ; restore r0/input to its original value - sub r0, r0, #32*8*2 - - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results. To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-10,11-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - - ; -------------------------------------------------------------------------- - ; BLOCK A: 16-19,28-31 - ; -------------------------------------------------------------------------- - ; generate 16,17,30,31 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; - ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; - ;step1b[16][i] = dct_const_round_shift(temp1); - ;step1b[31][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 0, 1, 31 - DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; - ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; - ;step1b[17][i] = dct_const_round_shift(temp1); - ;step1b[30][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 31, 17, 15 - DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[16] = step1b[16][i] + step1b[17][i]; - ;step2[17] = step1b[16][i] - step1b[17][i]; - ;step2[30] = -step1b[30][i] + step1b[31][i]; - ;step2[31] = step1b[30][i] + step1b[31][i]; - vadd.s16 q4, q0, q1 - vsub.s16 q13, q0, q1 - vadd.s16 q6, q2, q3 - vsub.s16 q14, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; - ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; - ;step3[17] = dct_const_round_shift(temp1); - ;step3[30] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; generate 18,19,28,29 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; - ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; - ;step1b[18][i] = dct_const_round_shift(temp1); - ;step1b[29][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 15, 9, 23 - DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; - ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; - ;step1b[19][i] = dct_const_round_shift(temp1); - ;step1b[28][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 23, 25, 7 - DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[18] = -step1b[18][i] + step1b[19][i]; - ;step2[19] = step1b[18][i] + step1b[19][i]; - ;step2[28] = step1b[28][i] + step1b[29][i]; - ;step2[29] = step1b[28][i] - step1b[29][i]; - vsub.s16 q13, q3, q2 - vadd.s16 q3, q3, q2 - vsub.s16 q14, q1, q0 - vadd.s16 q2, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); - ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); - ;step3[29] = dct_const_round_shift(temp1); - ;step3[18] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 - ; -------------------------------------------------------------------------- - ; combine 16-19,28-31 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[16] = step1b[16][i] + step1b[19][i]; - ;step1[17] = step1b[17][i] + step1b[18][i]; - ;step1[18] = step1b[17][i] - step1b[18][i]; - ;step1[29] = step1b[30][i] - step1b[29][i]; - ;step1[30] = step1b[30][i] + step1b[29][i]; - ;step1[31] = step1b[31][i] + step1b[28][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q0 - vadd.s16 q10, q7, q1 - vadd.s16 q15, q6, q3 - vsub.s16 q13, q5, q0 - vsub.s16 q14, q7, q1 - STORE_IN_OUTPUT 0, 16, 31, q8, q15 - STORE_IN_OUTPUT 31, 17, 30, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; - ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; - ;step2[18] = dct_const_round_shift(temp1); - ;step2[29] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 - STORE_IN_OUTPUT 30, 29, 18, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[19] = step1b[16][i] - step1b[19][i]; - ;step1[28] = step1b[31][i] - step1b[28][i]; - vsub.s16 q13, q4, q2 - vsub.s16 q14, q6, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; - ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; - ;step2[19] = dct_const_round_shift(temp1); - ;step2[28] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 - STORE_IN_OUTPUT 18, 19, 28, q4, q6 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK B: 20-23,24-27 - ; -------------------------------------------------------------------------- - ; generate 20,21,26,27 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; - ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; - ;step1b[20][i] = dct_const_round_shift(temp1); - ;step1b[27][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 7, 5, 27 - DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; - ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; - ;step1b[21][i] = dct_const_round_shift(temp1); - ;step1b[26][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 27, 21, 11 - DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[20] = step1b[20][i] + step1b[21][i]; - ;step2[21] = step1b[20][i] - step1b[21][i]; - ;step2[26] = -step1b[26][i] + step1b[27][i]; - ;step2[27] = step1b[26][i] + step1b[27][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; - ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; - ;step3[21] = dct_const_round_shift(temp1); - ;step3[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 22,23,24,25 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; - ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; - ;step1b[22][i] = dct_const_round_shift(temp1); - ;step1b[25][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 11, 13, 19 - DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; - ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; - ;step1b[23][i] = dct_const_round_shift(temp1); - ;step1b[24][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 19, 29, 3 - DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[22] = -step1b[22][i] + step1b[23][i]; - ;step2[23] = step1b[22][i] + step1b[23][i]; - ;step2[24] = step1b[24][i] + step1b[25][i]; - ;step2[25] = step1b[24][i] - step1b[25][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); - ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); - ;step3[25] = dct_const_round_shift(temp1); - ;step3[22] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 20-23,24-27 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[22] = step1b[22][i] + step1b[21][i]; - ;step1[23] = step1b[23][i] + step1b[20][i]; - vadd.s16 q10, q7, q1 - vadd.s16 q11, q5, q0 - ;step1[24] = step1b[24][i] + step1b[27][i]; - ;step1[25] = step1b[25][i] + step1b[26][i]; - vadd.s16 q12, q6, q2 - vadd.s16 q15, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[16] = step1b[16][i] + step1b[23][i]; - ;step3[17] = step1b[17][i] + step1b[22][i]; - ;step3[22] = step1b[17][i] - step1b[22][i]; - ;step3[23] = step1b[16][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 - vadd.s16 q8, q14, q11 - vadd.s16 q9, q13, q10 - vsub.s16 q13, q13, q10 - vsub.s16 q11, q14, q11 - STORE_IN_OUTPUT 17, 17, 16, q9, q8 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[24] = step1b[31][i] - step1b[24][i]; - ;step3[25] = step1b[30][i] - step1b[25][i]; - ;step3[30] = step1b[30][i] + step1b[25][i]; - ;step3[31] = step1b[31][i] + step1b[24][i]; - LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 - vsub.s16 q8, q9, q12 - vadd.s16 q10, q14, q15 - vsub.s16 q14, q14, q15 - vadd.s16 q12, q9, q12 - STORE_IN_OUTPUT 31, 30, 31, q10, q12 - ; -------------------------------------------------------------------------- - ; TODO(cd) do some register allocation change to remove these push/pop - vpush {q8} ; [24] - vpush {q11} ; [23] - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; - ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; - ;step1[22] = dct_const_round_shift(temp1); - ;step1[25] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 31, 25, 22, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; - ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; - ;step1[23] = dct_const_round_shift(temp1); - ;step1[24] = dct_const_round_shift(temp2); - ; TODO(cd) do some register allocation change to remove these push/pop - vpop {q13} ; [23] - vpop {q14} ; [24] - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 22, 24, 23, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[20] = step1b[23][i] - step1b[20][i]; - ;step1[27] = step1b[24][i] - step1b[27][i]; - vsub.s16 q14, q5, q0 - vsub.s16 q13, q6, q2 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); - ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); - ;step2[27] = dct_const_round_shift(temp1); - ;step2[20] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[21] = step1b[22][i] - step1b[21][i]; - ;step1[26] = step1b[25][i] - step1b[26][i]; - vsub.s16 q14, q7, q1 - vsub.s16 q13, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); - ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); - ;step2[26] = dct_const_round_shift(temp1); - ;step2[21] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[18] = step1b[18][i] + step1b[21][i]; - ;step3[19] = step1b[19][i] + step1b[20][i]; - ;step3[20] = step1b[19][i] - step1b[20][i]; - ;step3[21] = step1b[18][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 - vadd.s16 q8, q14, q1 - vadd.s16 q9, q13, q6 - vsub.s16 q13, q13, q6 - vsub.s16 q1, q14, q1 - STORE_IN_OUTPUT 19, 18, 19, q8, q9 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[27] = step1b[28][i] - step1b[27][i]; - ;step3[28] = step1b[28][i] + step1b[27][i]; - ;step3[29] = step1b[29][i] + step1b[26][i]; - ;step3[26] = step1b[29][i] - step1b[26][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 - vsub.s16 q14, q8, q5 - vadd.s16 q10, q8, q5 - vadd.s16 q11, q9, q0 - vsub.s16 q0, q9, q0 - STORE_IN_OUTPUT 29, 28, 29, q10, q11 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; - ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; - ;step1[20] = dct_const_round_shift(temp1); - ;step1[27] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 29, 20, 27, q13, q14 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; - ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; - ;step1[21] = dct_const_round_shift(temp1); - ;step1[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 - STORE_IN_OUTPUT 27, 21, 26, q1, q0 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK C: 8-10,11-15 - ; -------------------------------------------------------------------------- - ; generate 8,9,14,15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; - ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; - ;step2[8] = dct_const_round_shift(temp1); - ;step2[15] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 3, 2, 30 - DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; - ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; - ;step2[9] = dct_const_round_shift(temp1); - ;step2[14] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 30, 18, 14 - DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[8] = step1b[8][i] + step1b[9][i]; - ;step3[9] = step1b[8][i] - step1b[9][i]; - ;step3[14] = step1b[15][i] - step1b[14][i]; - ;step3[15] = step1b[15][i] + step1b[14][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; - ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; - ;step1[9] = dct_const_round_shift(temp1); - ;step1[14] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 10,11,12,13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; - ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; - ;step2[10] = dct_const_round_shift(temp1); - ;step2[13] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 14, 10, 22 - DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; - ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64; - ;step2[11] = dct_const_round_shift(temp1); - ;step2[12] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 22, 26, 6 - DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[10] = step1b[11][i] - step1b[10][i]; - ;step3[11] = step1b[11][i] + step1b[10][i]; - ;step3[12] = step1b[12][i] + step1b[13][i]; - ;step3[13] = step1b[12][i] - step1b[13][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); - ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); - ;step1[13] = dct_const_round_shift(temp1); - ;step1[10] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 8-10,11-15 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[8] = step1b[8][i] + step1b[11][i]; - ;step2[9] = step1b[9][i] + step1b[10][i]; - ;step2[10] = step1b[9][i] - step1b[10][i]; - vadd.s16 q8, q0, q5 - vadd.s16 q9, q1, q7 - vsub.s16 q13, q1, q7 - ;step2[13] = step1b[14][i] - step1b[13][i]; - ;step2[14] = step1b[14][i] + step1b[13][i]; - ;step2[15] = step1b[15][i] + step1b[12][i]; - vsub.s16 q14, q3, q4 - vadd.s16 q10, q3, q4 - vadd.s16 q15, q2, q6 - STORE_IN_OUTPUT 26, 8, 15, q8, q15 - STORE_IN_OUTPUT 15, 9, 14, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; - ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; - ;step3[10] = dct_const_round_shift(temp1); - ;step3[13] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 14, 13, 10, q3, q1 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[11] = step1b[8][i] - step1b[11][i]; - ;step2[12] = step1b[15][i] - step1b[12][i]; - vsub.s16 q13, q0, q5 - vsub.s16 q14, q2, q6 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; - ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; - ;step3[11] = dct_const_round_shift(temp1); - ;step3[12] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 10, 11, 12, q1, q3 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK D: 0-3,4-7 - ; -------------------------------------------------------------------------- - ; generate 4,5,6,7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; - ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; - ;step3[4] = dct_const_round_shift(temp1); - ;step3[7] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 6, 4, 28 - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64; - ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; - ;step3[5] = dct_const_round_shift(temp1); - ;step3[6] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 28, 20, 12 - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[4] = step1b[4][i] + step1b[5][i]; - ;step1[5] = step1b[4][i] - step1b[5][i]; - ;step1[6] = step1b[7][i] - step1b[6][i]; - ;step1[7] = step1b[7][i] + step1b[6][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; - ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; - ;step2[5] = dct_const_round_shift(temp1); - ;step2[6] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 0,1,2,3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; - ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; - ;step1[1] = dct_const_round_shift(temp1); - ;step1[0] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 12, 0, 16 - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; - ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; - ;step1[2] = dct_const_round_shift(temp1); - ;step1[3] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 16, 8, 24 - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[0] = step1b[0][i] + step1b[3][i]; - ;step2[1] = step1b[1][i] + step1b[2][i]; - ;step2[2] = step1b[1][i] - step1b[2][i]; - ;step2[3] = step1b[0][i] - step1b[3][i]; - vadd.s16 q4, q7, q6 - vsub.s16 q7, q7, q6 - vsub.s16 q6, q5, q14 - vadd.s16 q5, q5, q14 - ; -------------------------------------------------------------------------- - ; combine 0-3,4-7 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[0] = step1b[0][i] + step1b[7][i]; - ;step3[1] = step1b[1][i] + step1b[6][i]; - ;step3[2] = step1b[2][i] + step1b[5][i]; - ;step3[3] = step1b[3][i] + step1b[4][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q3 - vadd.s16 q10, q6, q1 - vadd.s16 q11, q7, q0 - ;step3[4] = step1b[3][i] - step1b[4][i]; - ;step3[5] = step1b[2][i] - step1b[5][i]; - ;step3[6] = step1b[1][i] - step1b[6][i]; - ;step3[7] = step1b[0][i] - step1b[7][i]; - vsub.s16 q12, q7, q0 - vsub.s16 q13, q6, q1 - vsub.s16 q14, q5, q3 - vsub.s16 q15, q4, q2 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[0] = step1b[0][i] + step1b[15][i]; - ;step1[1] = step1b[1][i] + step1b[14][i]; - ;step1[14] = step1b[1][i] - step1b[14][i]; - ;step1[15] = step1b[0][i] - step1b[15][i]; - LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 - vadd.s16 q2, q8, q1 - vadd.s16 q3, q9, q0 - vsub.s16 q4, q9, q0 - vsub.s16 q5, q8, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[14 * 32] = step1b[14][i] + step1b[17][i]; - ;output[15 * 32] = step1b[15][i] + step1b[16][i]; - ;output[16 * 32] = step1b[15][i] - step1b[16][i]; - ;output[17 * 32] = step1b[14][i] - step1b[17][i]; - LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - - cmp r5, #0 - bgt idct32_bands_end_2nd_pass - -idct32_bands_end_1st_pass - STORE_IN_OUTPUT 17, 16, 17, q6, q7 - STORE_IN_OUTPUT 17, 14, 15, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 30, 31, q6, q7 - STORE_IN_OUTPUT 31, 0, 1, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 19, 18, 19, q6, q7 - STORE_IN_OUTPUT 19, 12, 13, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 28, 29, q6, q7 - STORE_IN_OUTPUT 29, 2, 3, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 21, 20, 21, q6, q7 - STORE_IN_OUTPUT 21, 10, 11, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 26, 27, q6, q7 - STORE_IN_OUTPUT 27, 4, 5, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 23, 22, 23, q6, q7 - STORE_IN_OUTPUT 23, 8, 9, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 24, 25, q6, q7 - STORE_IN_OUTPUT 25, 6, 7, q4, q5 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #7*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; parameters for second pass - ; the input of pass2 is the result of pass1. we have to remove the offset - ; of 32 columns induced by the above idct32_bands_loop - sub r3, r1, #32*2 - ; r1 = pass2[32 * 32] - add r1, sp, #2048 - - ; pass loop processing - add r5, r5, #1 - b idct32_pass_loop - -idct32_bands_end_2nd_pass - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; restore pointers to their initial indices for next band pass by - ; removing/adding dest_stride * 8. The actual increment by eight - ; is taken care of within the _LAST macros. - add r6, r6, r2, lsl #3 - add r9, r9, r2, lsl #3 - sub r7, r7, r2, lsl #3 - sub r10, r10, r2, lsl #3 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #25*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; stack operation - add sp, sp, #512+2048+2048 - vpop {d8-d15} - pop {r4-r11} - bx lr - ENDP ; |aom_idct32x32_1024_add_neon| - END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c deleted file mode 100644 index 3df7a901b..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); - - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm deleted file mode 100644 index 6bd733d5d..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_1_add_neon_asm.asm +++ /dev/null @@ -1,71 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct4x4_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct4x4_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 4) - add r0, r0, #8 ; + (1 <<((4) - 1)) - asr r0, r0, #4 ; >> 4 - - vdup.s16 q0, r0 ; duplicate a1 - - vld1.32 {d2[0]}, [r1], r2 - vld1.32 {d2[1]}, [r1], r2 - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q8, q0, d2 ; dest[x] + a1 - vaddw.u8 q9, q0, d4 - - vqmovun.s16 d6, q8 ; clip_pixel - vqmovun.s16 d7, q9 - - vst1.32 {d6[0]}, [r12], r2 - vst1.32 {d6[1]}, [r12], r2 - vst1.32 {d7[0]}, [r12], r2 - vst1.32 {d7[1]}, [r12] - - bx lr - ENDP ; |aom_idct4x4_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c b/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c deleted file mode 100644 index 763be1ab0..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "aom_dsp/txfm_common.h" - -void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16((int16_t)cospi_8_64); - d21s16 = vdup_n_s16((int16_t)cospi_16_64); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16((int16_t)cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm deleted file mode 100644 index 127acf614..000000000 --- a/third_party/aom/aom_dsp/arm/idct4x4_add_neon_asm.asm +++ /dev/null @@ -1,193 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct4x4_16_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY ; name this block of code -;void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct4x4_16_add_neon| PROC - - ; The 2D transform is done with two passes which are actually pretty - ; similar. We first transform the rows. This is done by transposing - ; the inputs, doing an SIMD column transform (the columns are the - ; transposed rows) and then transpose the results (so that it goes back - ; in normal/row positions). Then, we transform the columns by doing - ; another SIMD column transform. - ; So, two passes of a transpose followed by a column transform. - - ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! - - ; generate scalar constants - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x 187e - mov r12, #0x1800 - add r12, #0x7e - - ; transpose the input data - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - - ; generate constant vectors - vdup.16 d20, r0 ; replicate cospi_8_64 - vdup.16 d21, r3 ; replicate cospi_16_64 - - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - vdup.16 d22, r12 ; replicate cospi_24_64 - - ; do the transform on transposed rows - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - vswp d18, d19 - - ; transpose the results - ; 00 01 02 03 d16 - ; 10 11 12 13 d17 - ; 20 21 22 23 d18 - ; 30 31 32 33 d19 - vtrn.16 d16, d17 - vtrn.16 d18, d19 - ; 00 10 02 12 d16 - ; 01 11 03 13 d17 - ; 20 30 22 32 d18 - ; 21 31 23 33 d19 - vtrn.32 q8, q9 - ; 00 10 20 30 d16 - ; 01 11 21 31 d17 - ; 02 12 22 32 d18 - ; 03 13 23 33 d19 - - ; do the transform on columns - - ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 - vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 - - ; (input[0] + input[2]) * cospi_16_64; - ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64; - ; input[1] * cospi_8_64 + input[3] * cospi_24_64; - vmlsl.s16 q15, d19, d20 - vmlal.s16 q1, d19, d22 - - ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 - - ; stage 2 - ; output[0] = step[0] + step[3]; - ; output[1] = step[1] + step[2]; - ; output[3] = step[0] - step[3]; - ; output[2] = step[1] - step[2]; - vadd.s16 q8, q13, q14 - vsub.s16 q9, q13, q14 - - ; The results are in two registers, one of them being swapped. This will - ; be taken care of by loading the 'dest' value in a swapped fashion and - ; also storing them in the same swapped fashion. - ; temp_out[0, 1] = d16, d17 = q8 - ; temp_out[2, 3] = d19, d18 = q9 swapped - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) - vrshr.s16 q8, q8, #4 - vrshr.s16 q9, q9, #4 - - vld1.32 {d26[0]}, [r1], r2 - vld1.32 {d26[1]}, [r1], r2 - vld1.32 {d27[1]}, [r1], r2 - vld1.32 {d27[0]}, [r1] ; no post-increment - - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d26 - vaddw.u8 q9, q9, d27 - - ; clip_pixel - vqmovun.s16 d26, q8 - vqmovun.s16 d27, q9 - - ; do the stores in reverse order with negative post-increment, by changing - ; the sign of the stride - rsb r2, r2, #0 - vst1.32 {d27[0]}, [r1], r2 - vst1.32 {d27[1]}, [r1], r2 - vst1.32 {d26[1]}, [r1], r2 - vst1.32 {d26[0]}, [r1] ; no post-increment - bx lr - ENDP ; |aom_idct4x4_16_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c deleted file mode 100644 index c7926f9e4..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "aom_dsp/inv_txfm.h" -#include "aom_ports/mem.h" - -void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm deleted file mode 100644 index ec07e2053..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_1_add_neon_asm.asm +++ /dev/null @@ -1,91 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - - - - EXPORT |aom_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |aom_idct8x8_1_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c b/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c deleted file mode 100644 index 8ad70862d..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon.c +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" - -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16((int16_t)cospi_24_64); - d1s16 = vdup_n_s16((int16_t)cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); - return; -} - -void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} - -void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); - q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2); - - q13s16 = vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - return; -} diff --git a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm b/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm deleted file mode 100644 index f3d5f246d..000000000 --- a/third_party/aom/aom_dsp/arm/idct8x8_add_neon_asm.asm +++ /dev/null @@ -1,522 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_idct8x8_64_add_neon| - EXPORT |aom_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. - MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void aom_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |aom_idct8x8_64_add_neon| - -;void aom_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride) - -|aom_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! - - ; transpose the input data - TRANSPOSE8X8 - - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c - - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 - - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 - - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e - - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 - - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e - - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. - mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |aom_idct8x8_12_add_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c index 7d5f64004..69470eeb0 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -11,8 +11,9 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" //------------------------------------------------------------------------------ @@ -342,8 +343,6 @@ void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); } -#if !HAVE_NEON_ASM - void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; @@ -529,4 +528,3 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, } } } -#endif // !HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm b/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm deleted file mode 100644 index fba9c1b5b..000000000 --- a/third_party/aom/aom_dsp/arm/intrapred_neon_asm.asm +++ /dev/null @@ -1,287 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_v_predictor_4x4_neon| - EXPORT |aom_v_predictor_8x8_neon| - EXPORT |aom_v_predictor_16x16_neon| - EXPORT |aom_v_predictor_32x32_neon| - EXPORT |aom_h_predictor_4x4_neon| - EXPORT |aom_h_predictor_8x8_neon| - EXPORT |aom_h_predictor_16x16_neon| - EXPORT |aom_h_predictor_32x32_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_4x4_neon| PROC - vld1.32 {d0[0]}, [r2] - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |aom_v_predictor_4x4_neon| - -;void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_8x8_neon| PROC - vld1.8 {d0}, [r2] - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - bx lr - ENDP ; |aom_v_predictor_8x8_neon| - -;void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_16x16_neon| PROC - vld1.8 {q0}, [r2] - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |aom_v_predictor_16x16_neon| - -;void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_v_predictor_32x32_neon| PROC - vld1.8 {q0, q1}, [r2] - mov r2, #2 -loop_v - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - subs r2, r2, #1 - bgt loop_v - bx lr - ENDP ; |aom_v_predictor_32x32_neon| - -;void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_4x4_neon| PROC - vld1.32 {d1[0]}, [r3] - vdup.8 d0, d1[0] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[1] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[2] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[3] - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |aom_h_predictor_4x4_neon| - -;void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_8x8_neon| PROC - vld1.64 {d1}, [r3] - vdup.8 d0, d1[0] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[1] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[2] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[3] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[4] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[5] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[6] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[7] - vst1.64 {d0}, [r0], r1 - bx lr - ENDP ; |aom_h_predictor_8x8_neon| - -;void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_16x16_neon| PROC - vld1.8 {q1}, [r3] - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |aom_h_predictor_16x16_neon| - -;void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|aom_h_predictor_32x32_neon| PROC - sub r1, r1, #16 - mov r2, #2 -loop_h - vld1.8 {q1}, [r3]! - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - subs r2, r2, #1 - bgt loop_h - bx lr - ENDP ; |aom_h_predictor_32x32_neon| diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c deleted file mode 100644 index c0562a6ea..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon.c +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" -#include "aom/aom_integer.h" - -static INLINE void loop_filter_neon_16(uint8x16_t qblimit, // blimit - uint8x16_t qlimit, // limit - uint8x16_t qthresh, // thresh - uint8x16_t q3, // p3 - uint8x16_t q4, // p2 - uint8x16_t q5, // p1 - uint8x16_t q6, // p0 - uint8x16_t q7, // q0 - uint8x16_t q8, // q1 - uint8x16_t q9, // q2 - uint8x16_t q10, // q3 - uint8x16_t *q5r, // p1 - uint8x16_t *q6r, // p0 - uint8x16_t *q7r, // q0 - uint8x16_t *q8r) { // q1 - uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int16x8_t q2s16, q11s16; - uint16x8_t q4u16; - int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8; - int8x8_t d2s8, d3s8; - - q11u8 = vabdq_u8(q3, q4); - q12u8 = vabdq_u8(q4, q5); - q13u8 = vabdq_u8(q5, q6); - q14u8 = vabdq_u8(q8, q7); - q3 = vabdq_u8(q9, q8); - q4 = vabdq_u8(q10, q9); - - q11u8 = vmaxq_u8(q11u8, q12u8); - q12u8 = vmaxq_u8(q13u8, q14u8); - q3 = vmaxq_u8(q3, q4); - q15u8 = vmaxq_u8(q11u8, q12u8); - - q9 = vabdq_u8(q6, q7); - - // aom_hevmask - q13u8 = vcgtq_u8(q13u8, qthresh); - q14u8 = vcgtq_u8(q14u8, qthresh); - q15u8 = vmaxq_u8(q15u8, q3); - - q2u8 = vabdq_u8(q5, q8); - q9 = vqaddq_u8(q9, q9); - - q15u8 = vcgeq_u8(qlimit, q15u8); - - // aom_filter() function - // convert to signed - q10 = vdupq_n_u8(0x80); - q8 = veorq_u8(q8, q10); - q7 = veorq_u8(q7, q10); - q6 = veorq_u8(q6, q10); - q5 = veorq_u8(q5, q10); - - q2u8 = vshrq_n_u8(q2u8, 1); - q9 = vqaddq_u8(q9, q2u8); - - q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), - vget_low_s8(vreinterpretq_s8_u8(q6))); - q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), - vget_high_s8(vreinterpretq_s8_u8(q6))); - - q9 = vcgeq_u8(qblimit, q9); - - q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8)); - - q14u8 = vorrq_u8(q13u8, q14u8); - - q4u16 = vdupq_n_u16(3); - q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); - q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); - - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); - q15u8 = vandq_u8(q15u8, q9); - - q1s8 = vreinterpretq_s8_u8(q1u8); - q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); - q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); - - q4 = vdupq_n_u8(3); - q9 = vdupq_n_u8(4); - // aom_filter = clamp(aom_filter + 3 * ( qs0 - ps0)) - d2s8 = vqmovn_s16(q2s16); - d3s8 = vqmovn_s16(q11s16); - q1s8 = vcombine_s8(d2s8, d3s8); - q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); - q1s8 = vreinterpretq_s8_u8(q1u8); - - q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4)); - q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); - q2s8 = vshrq_n_s8(q2s8, 3); - q1s8 = vshrq_n_s8(q1s8, 3); - - q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); - q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); - - q1s8 = vrshrq_n_s8(q1s8, 1); - q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); - - q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); - q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); - - *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10); - *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10); - *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10); - *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10); - return; -} - -void aom_lpf_horizontal_4_dual_neon( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; - uint8x16_t qblimit, qlimit, qthresh; - uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; - - dblimit0 = vld1_u8(blimit0); - dlimit0 = vld1_u8(limit0); - dthresh0 = vld1_u8(thresh0); - dblimit1 = vld1_u8(blimit1); - dlimit1 = vld1_u8(limit1); - dthresh1 = vld1_u8(thresh1); - qblimit = vcombine_u8(dblimit0, dblimit1); - qlimit = vcombine_u8(dlimit0, dlimit1); - qthresh = vcombine_u8(dthresh0, dthresh1); - - s -= (p << 2); - - q3u8 = vld1q_u8(s); - s += p; - q4u8 = vld1q_u8(s); - s += p; - q5u8 = vld1q_u8(s); - s += p; - q6u8 = vld1q_u8(s); - s += p; - q7u8 = vld1q_u8(s); - s += p; - q8u8 = vld1q_u8(s); - s += p; - q9u8 = vld1q_u8(s); - s += p; - q10u8 = vld1q_u8(s); - - loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, - q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); - - s -= (p * 5); - vst1q_u8(s, q5u8); - s += p; - vst1q_u8(s, q6u8); - s += p; - vst1q_u8(s, q7u8); - s += p; - vst1q_u8(s, q8u8); - return; -} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm deleted file mode 100644 index b6e2c9edb..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_16_neon_asm.asm +++ /dev/null @@ -1,202 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_4_dual_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void aom_lpf_horizontal_4_dual_neon(uint8_t *s, int p, -; const uint8_t *blimit0, -; const uint8_t *limit0, -; const uint8_t *thresh0, -; const uint8_t *blimit1, -; const uint8_t *limit1, -; const uint8_t *thresh1) -; r0 uint8_t *s, -; r1 int p, -; r2 const uint8_t *blimit0, -; r3 const uint8_t *limit0, -; sp const uint8_t *thresh0, -; sp+4 const uint8_t *blimit1, -; sp+8 const uint8_t *limit1, -; sp+12 const uint8_t *thresh1, - -|aom_lpf_horizontal_4_dual_neon| PROC - push {lr} - - ldr r12, [sp, #4] ; load thresh0 - vld1.8 {d0}, [r2] ; load blimit0 to first half q - vld1.8 {d2}, [r3] ; load limit0 to first half q - - add r1, r1, r1 ; double pitch - ldr r2, [sp, #8] ; load blimit1 - - vld1.8 {d4}, [r12] ; load thresh0 to first half q - - ldr r3, [sp, #12] ; load limit1 - ldr r12, [sp, #16] ; load thresh1 - vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q - - sub r2, r0, r1, lsl #1 ; s[-4 * p] - - vld1.8 {d3}, [r3] ; load limit1 to 2nd half q - vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q - - vpush {d8-d15} ; save neon registers - - add r3, r2, r1, lsr #1 ; s[-3 * p] - - vld1.u8 {q3}, [r2@64], r1 ; p3 - vld1.u8 {q4}, [r3@64], r1 ; p2 - vld1.u8 {q5}, [r2@64], r1 ; p1 - vld1.u8 {q6}, [r3@64], r1 ; p0 - vld1.u8 {q7}, [r2@64], r1 ; q0 - vld1.u8 {q8}, [r3@64], r1 ; q1 - vld1.u8 {q9}, [r2@64] ; q2 - vld1.u8 {q10}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl aom_loop_filter_neon_16 - - vst1.u8 {q5}, [r2@64], r1 ; store op1 - vst1.u8 {q6}, [r3@64], r1 ; store op0 - vst1.u8 {q7}, [r2@64], r1 ; store oq0 - vst1.u8 {q8}, [r3@64], r1 ; store oq1 - - vpop {d8-d15} ; restore neon registers - - pop {pc} - ENDP ; |aom_lpf_horizontal_4_dual_neon| - -; void aom_loop_filter_neon_16(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. This function uses -; registers d8-d15, so the calling function must save those registers. -; -; r0-r3, r12 PRESERVE -; q0 blimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -; -; Outputs: -; q5 op1 -; q6 op0 -; q7 oq0 -; q8 oq1 -|aom_loop_filter_neon_16| PROC - - ; filter_mask - vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) - vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1) - vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0) - vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0) - vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1) - vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 q11, q11, q12 ; m7 = max(m1, m2) - vmax.u8 q12, q13, q14 ; m8 = max(m3, m4) - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - vmax.u8 q3, q3, q4 ; m9 = max(m5, m6) - - vmov.u8 q10, #0x80 - - vmax.u8 q15, q11, q12 ; m10 = max(m7, m8) - - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 ; m11 = max(m10, m9) - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - veor q7, q7, q10 ; qs0 - - vcge.u8 q15, q1, q15 ; abs(m11) > limit - - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u16 q4, #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; a > blimit - - vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; hev - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; filter &= hev - vand q15, q15, q9 ; mask - - vmov.u8 q4, #3 - - vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0) - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; filter &= mask - - vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3) - vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4) - vshr.s8 q2, q2, #3 ; filter2 >>= 3 - vshr.s8 q1, q1, #3 ; filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2) - vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1 - - veor q7, q0, q10 ; *oq0 = u^0x80 - - vbic q1, q1, q14 ; filter &= ~hev - - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter) - - veor q6, q11, q10 ; *op0 = u^0x80 - veor q5, q13, q10 ; *op1 = u^0x80 - veor q8, q12, q10 ; *oq1 = u^0x80 - - bx lr - ENDP ; |aom_loop_filter_neon_16| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c deleted file mode 100644 index 2b1f80b81..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" - -static INLINE void loop_filter_neon(uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; -} - -void aom_lpf_horizontal_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; -} - -void aom_lpf_vertical_4_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < 1; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - loop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm deleted file mode 100644 index 8b54984d5..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_4_neon_asm.asm +++ /dev/null @@ -1,252 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_4_neon| - EXPORT |aom_lpf_vertical_4_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_horizontal_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_horizontal_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #4] ; load thresh - add r1, r1, r1 ; double pitch - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - - sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r3, r2, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r2@64], r1 ; p3 - vld1.u8 {d4}, [r3@64], r1 ; p2 - vld1.u8 {d5}, [r2@64], r1 ; p1 - vld1.u8 {d6}, [r3@64], r1 ; p0 - vld1.u8 {d7}, [r2@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r2@64] ; q2 - vld1.u8 {d18}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl aom_loop_filter_neon - - vst1.u8 {d4}, [r2@64], r1 ; store op1 - vst1.u8 {d5}, [r3@64], r1 ; store op0 - vst1.u8 {d6}, [r2@64], r1 ; store oq0 - vst1.u8 {d7}, [r3@64], r1 ; store oq1 - - pop {pc} - ENDP ; |aom_lpf_horizontal_4_neon| - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_vertical_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #4] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - bl aom_loop_filter_neon - - sub r0, r0, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 - vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 - vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 - vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 - vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 - vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 - vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 - vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] - - pop {pc} - ENDP ; |aom_lpf_vertical_4_neon| - -; void aom_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d4 op1 -; d5 op0 -; d6 oq0 -; d7 oq1 -|aom_loop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d17, d6, d7 ; abs(p0 - q0) - - vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) - - vmov.u8 d18, #0x80 - - vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) - - ; hevmask - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) - - vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 - - veor d7, d7, d18 ; qs0 - - vcge.u8 d23, d1, d23 ; abs(m1) > limit - - ; filter() function - ; convert to signed - - vshr.u8 d28, d28, #1 ; a = a / 2 - veor d6, d6, d18 ; ps0 - - veor d5, d5, d18 ; ps1 - vqadd.u8 d17, d17, d28 ; a = b + a - - veor d16, d16, d18 ; qs1 - - vmov.u8 d19, #3 - - vsub.s8 d28, d7, d6 ; ( qs0 - ps0) - - vcge.u8 d17, d0, d17 ; a > blimit - - vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) - vorr d22, d21, d22 ; hevmask - - vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) - - vand d27, d27, d22 ; filter &= hev - vand d23, d23, d17 ; filter_mask - - vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d17, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d27, q12 - - vand d27, d27, d23 ; filter &= mask - - vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) - vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) - vshr.s8 d28, d28, #3 ; filter2 >>= 3 - vshr.s8 d27, d27, #3 ; filter1 >>= 3 - - vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) - vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 - - veor d6, d26, d18 ; *oq0 = u^0x80 - - vbic d27, d27, d22 ; filter &= ~hev - - vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) - vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) - - veor d5, d19, d18 ; *op0 = u^0x80 - veor d4, d21, d18 ; *op1 = u^0x80 - veor d7, d20, d18 ; *oq1 = u^0x80 - - bx lr - ENDP ; |aom_loop_filter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c deleted file mode 100644 index c4502fdb5..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" - -static INLINE void mbloop_filter_neon(uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - - d25u8 = vabd_u8(d6u8, d4u8); - - d23u8 = vmax_u8(d23u8, d24u8); - - d26u8 = vabd_u8(d7u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); - - d19u8 = vmax_u8(d19u8, d23u8); - - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); - - d19u8 = vcge_u8(dlimit, d19u8); - - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); - - d23u8 = vshr_n_u8(d23u8, 1); - - d25u8 = vmax_u8(d25u8, d26u8); - - d24u8 = vqadd_u8(d24u8, d23u8); - - d20u8 = vmax_u8(d20u8, d25u8); - - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - - d20u8 = vcge_u8(d23u8, d20u8); - - d19u8 = vand_u8(d19u8, d24u8); - - d23u8 = vcgt_u8(d22u8, dthresh); - - d20u8 = vand_u8(d20u8, d19u8); - - d22u8 = vdup_n_u8(0x80); - - d23u8 = vorr_u8(d21u8, d23u8); - - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8)); - - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); - - d27u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - q15s16 = vaddw_s8(q15s16, d29s8); - - d29u8 = vdup_n_u8(4); - - d28s8 = vqmovn_s16(q15s16); - - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); - - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } - - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - - q14u16 = vaddw_u8(q14u16, d5u8); - - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - - d30u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - - d31u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - - d23u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - - d22u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - - d6u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - - d7u8 = vqrshrn_n_u16(q14u16, 3); - - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; -} - -void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < 1; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, - &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; -} - -void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i < 1; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, - d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, - &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; -} diff --git a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm b/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm deleted file mode 100644 index 9f3db66ee..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_8_neon_asm.asm +++ /dev/null @@ -1,428 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_8_neon| - EXPORT |aom_lpf_vertical_8_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; Currently aom only works on iterations 8 at a time. The aom loop filter -; works on 16 iterations at a time. -; -; void aom_lpf_horizontal_8_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_horizontal_8_neon| PROC - push {r4-r5, lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #12] ; load thresh - add r1, r1, r1 ; double pitch - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - - sub r3, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r2, r3, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r3@64], r1 ; p3 - vld1.u8 {d4}, [r2@64], r1 ; p2 - vld1.u8 {d5}, [r3@64], r1 ; p1 - vld1.u8 {d6}, [r2@64], r1 ; p0 - vld1.u8 {d7}, [r3@64], r1 ; q0 - vld1.u8 {d16}, [r2@64], r1 ; q1 - vld1.u8 {d17}, [r3@64] ; q2 - vld1.u8 {d18}, [r2@64], r1 ; q3 - - sub r3, r3, r1, lsl #1 - sub r2, r2, r1, lsl #2 - - bl aom_mbloop_filter_neon - - vst1.u8 {d0}, [r2@64], r1 ; store op2 - vst1.u8 {d1}, [r3@64], r1 ; store op1 - vst1.u8 {d2}, [r2@64], r1 ; store op0 - vst1.u8 {d3}, [r3@64], r1 ; store oq0 - vst1.u8 {d4}, [r2@64], r1 ; store oq1 - vst1.u8 {d5}, [r3@64], r1 ; store oq2 - - pop {r4-r5, pc} - - ENDP ; |aom_lpf_horizontal_8_neon| - -; void aom_lpf_vertical_8_neon(uint8_t *s, -; int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_8_neon| PROC - push {r4-r5, lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #12] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - sub r2, r0, #3 - add r3, r0, #1 - - bl aom_mbloop_filter_neon - - ;store op2, op1, op0, oq0 - vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 - vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 - vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 - vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 - vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 - vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 - vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 - vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] - - ;store oq1, oq2 - vst2.8 {d4[0], d5[0]}, [r3], r1 - vst2.8 {d4[1], d5[1]}, [r3], r1 - vst2.8 {d4[2], d5[2]}, [r3], r1 - vst2.8 {d4[3], d5[3]}, [r3], r1 - vst2.8 {d4[4], d5[4]}, [r3], r1 - vst2.8 {d4[5], d5[5]}, [r3], r1 - vst2.8 {d4[6], d5[6]}, [r3], r1 - vst2.8 {d4[7], d5[7]}, [r3] - - pop {r4-r5, pc} - ENDP ; |aom_lpf_vertical_8_neon| - -; void aom_mbloop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d0 op2 -; d1 op1 -; d2 op0 -; d3 oq0 -; d4 oq1 -; d5 oq2 -|aom_mbloop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) - - vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) - - vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) - - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) - vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) - vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d1, d19 - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) - vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) - - vshr.u8 d23, d23, #1 ; a = a / 2 - - vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) - - vqadd.u8 d24, d24, d23 ; a = b + a - - vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) - - vmov.u8 d23, #1 - vcge.u8 d24, d0, d24 ; a > blimit - - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - - vcge.u8 d20, d23, d20 ; flat - - vand d19, d19, d24 ; mask - - vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - - vand d20, d20, d19 ; flat & mask - - vmov.u8 d22, #0x80 - - vorr d23, d21, d23 ; hev - - ; This instruction will truncate the "flat & mask" masks down to 4 bits - ; each to fit into one 32 bit arm register. The values are stored in - ; q10.64[0]. - vshrn.u16 d30, q10, #4 - vmov.u32 r4, d30[0] ; flat & mask 4bits - - adds r5, r4, #1 ; Check for all 1's - - ; If mask and flat are 1's for all vectors, then we only need to execute - ; the power branch for all vectors. - beq power_branch_only - - cmp r4, #0 ; Check for 0, set flag for later - - ; mbfilter() function - ; filter() function - ; convert to signed - veor d21, d7, d22 ; qs0 - veor d24, d6, d22 ; ps0 - veor d25, d5, d22 ; ps1 - veor d26, d16, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d21, d24 ; ( qs0 - ps0) - - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - - vand d29, d29, d23 ; filter &= hev - - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 d29, d29, #1 - vbic d29, d29, d23 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - ; If mask and flat are 0's for all vectors, then we only need to execute - ; the filter branch for all vectors. - beq filter_branch_only - - ; If mask and flat are mixed then we must perform both branches and - ; combine the data. - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d21, d21, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - ; At this point we have already executed the filter branch. The filter - ; branch does not set op2 or oq2, so use p2 and q2. Execute the power - ; branch and combine the data. - vmov.u8 d23, #2 - vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 - vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 - vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 - - vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) - - vaddw.u8 q14, d5 ; r_op2 += p1 - - vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) - - vqrshrn.u16 d30, q14, #3 ; r_op2 - - vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 - vsubw.u8 q14, d4 ; r_op1 -= p2 - vaddw.u8 q14, d5 ; r_op1 += p1 - vaddw.u8 q14, d16 ; r_op1 += q1 - - vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) - - vqrshrn.u16 d31, q14, #3 ; r_op1 - - vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 - vsubw.u8 q14, d5 ; r_op0 -= p1 - vaddw.u8 q14, d6 ; r_op0 += p0 - vaddw.u8 q14, d17 ; r_op0 += q2 - - vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) - - vqrshrn.u16 d23, q14, #3 ; r_op0 - - vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 - vsubw.u8 q14, d6 ; r_oq0 -= p0 - vaddw.u8 q14, d7 ; r_oq0 += q0 - - vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) - - vaddw.u8 q14, d18 ; oq0 += q3 - - vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) - - vqrshrn.u16 d22, q14, #3 ; r_oq0 - - vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 - vsubw.u8 q14, d7 ; r_oq1 -= q0 - vaddw.u8 q14, d16 ; r_oq1 += q1 - - vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) - - vaddw.u8 q14, d18 ; r_oq1 += q3 - - vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) - - vqrshrn.u16 d6, q14, #3 ; r_oq1 - - vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 - vsubw.u8 q14, d16 ; r_oq2 -= q1 - vaddw.u8 q14, d17 ; r_oq2 += q2 - vaddw.u8 q14, d18 ; r_oq2 += q3 - - vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) - - vqrshrn.u16 d7, q14, #3 ; r_oq2 - - vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) - vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) - vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) - - bx lr - -power_branch_only - vmov.u8 d27, #3 - vmov.u8 d21, #2 - vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 - vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 - vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 - vaddw.u8 q14, d5 ; op2 += p1 - vqrshrn.u16 d0, q14, #3 ; op2 - - vsubw.u8 q14, d3 ; op1 = op2 - p3 - vsubw.u8 q14, d4 ; op1 -= p2 - vaddw.u8 q14, d5 ; op1 += p1 - vaddw.u8 q14, d16 ; op1 += q1 - vqrshrn.u16 d1, q14, #3 ; op1 - - vsubw.u8 q14, d3 ; op0 = op1 - p3 - vsubw.u8 q14, d5 ; op0 -= p1 - vaddw.u8 q14, d6 ; op0 += p0 - vaddw.u8 q14, d17 ; op0 += q2 - vqrshrn.u16 d2, q14, #3 ; op0 - - vsubw.u8 q14, d3 ; oq0 = op0 - p3 - vsubw.u8 q14, d6 ; oq0 -= p0 - vaddw.u8 q14, d7 ; oq0 += q0 - vaddw.u8 q14, d18 ; oq0 += q3 - vqrshrn.u16 d3, q14, #3 ; oq0 - - vsubw.u8 q14, d4 ; oq1 = oq0 - p2 - vsubw.u8 q14, d7 ; oq1 -= q0 - vaddw.u8 q14, d16 ; oq1 += q1 - vaddw.u8 q14, d18 ; oq1 += q3 - vqrshrn.u16 d4, q14, #3 ; oq1 - - vsubw.u8 q14, d5 ; oq2 = oq1 - p1 - vsubw.u8 q14, d16 ; oq2 -= q1 - vaddw.u8 q14, d17 ; oq2 += q2 - vaddw.u8 q14, d18 ; oq2 += q3 - vqrshrn.u16 d5, q14, #3 ; oq2 - - bx lr - -filter_branch_only - ; TODO(fgalligan): See if we can rearange registers so we do not need to - ; do the 2 vswp. - vswp d0, d4 ; op2 - vswp d5, d17 ; oq2 - veor d2, d24, d22 ; *op0 = u^0x80 - veor d3, d21, d22 ; *oq0 = u^0x80 - veor d1, d25, d22 ; *op1 = u^0x80 - veor d4, d26, d22 ; *oq1 = u^0x80 - - bx lr - - ENDP ; |aom_mbloop_filter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm b/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm deleted file mode 100644 index 675928860..000000000 --- a/third_party/aom/aom_dsp/arm/loopfilter_mb_neon.asm +++ /dev/null @@ -1,638 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - EXPORT |aom_lpf_horizontal_edge_8_neon| - EXPORT |aom_lpf_horizontal_edge_16_neon| - EXPORT |aom_lpf_vertical_16_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void mb_lpf_horizontal_edge(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; r12 int count -|mb_lpf_horizontal_edge| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - -h_count - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d0}, [r8@64], r1 ; p7 - vld1.u8 {d1}, [r8@64], r1 ; p6 - vld1.u8 {d2}, [r8@64], r1 ; p5 - vld1.u8 {d3}, [r8@64], r1 ; p4 - vld1.u8 {d4}, [r8@64], r1 ; p3 - vld1.u8 {d5}, [r8@64], r1 ; p2 - vld1.u8 {d6}, [r8@64], r1 ; p1 - vld1.u8 {d7}, [r8@64], r1 ; p0 - vld1.u8 {d8}, [r8@64], r1 ; q0 - vld1.u8 {d9}, [r8@64], r1 ; q1 - vld1.u8 {d10}, [r8@64], r1 ; q2 - vld1.u8 {d11}, [r8@64], r1 ; q3 - vld1.u8 {d12}, [r8@64], r1 ; q4 - vld1.u8 {d13}, [r8@64], r1 ; q5 - vld1.u8 {d14}, [r8@64], r1 ; q6 - vld1.u8 {d15}, [r8@64], r1 ; q7 - - bl aom_wide_mbfilter_neon - - tst r7, #1 - beq h_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. - sub r8, r0, r1, lsl #1 - - vst1.u8 {d25}, [r8@64], r1 ; store op1 - vst1.u8 {d24}, [r8@64], r1 ; store op0 - vst1.u8 {d23}, [r8@64], r1 ; store oq0 - vst1.u8 {d26}, [r8@64], r1 ; store oq1 - - b h_next - -h_mbfilter - tst r7, #2 - beq h_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, r1, lsl #1 - sub r8, r8, r1 - - vst1.u8 {d18}, [r8@64], r1 ; store op2 - vst1.u8 {d19}, [r8@64], r1 ; store op1 - vst1.u8 {d20}, [r8@64], r1 ; store op0 - vst1.u8 {d21}, [r8@64], r1 ; store oq0 - vst1.u8 {d22}, [r8@64], r1 ; store oq1 - vst1.u8 {d23}, [r8@64], r1 ; store oq2 - - b h_next - -h_wide_mbfilter - sub r8, r0, r1, lsl #3 - add r8, r8, r1 - - vst1.u8 {d16}, [r8@64], r1 ; store op6 - vst1.u8 {d24}, [r8@64], r1 ; store op5 - vst1.u8 {d25}, [r8@64], r1 ; store op4 - vst1.u8 {d26}, [r8@64], r1 ; store op3 - vst1.u8 {d27}, [r8@64], r1 ; store op2 - vst1.u8 {d18}, [r8@64], r1 ; store op1 - vst1.u8 {d19}, [r8@64], r1 ; store op0 - vst1.u8 {d20}, [r8@64], r1 ; store oq0 - vst1.u8 {d21}, [r8@64], r1 ; store oq1 - vst1.u8 {d22}, [r8@64], r1 ; store oq2 - vst1.u8 {d23}, [r8@64], r1 ; store oq3 - vst1.u8 {d1}, [r8@64], r1 ; store oq4 - vst1.u8 {d2}, [r8@64], r1 ; store oq5 - vst1.u8 {d3}, [r8@64], r1 ; store oq6 - -h_next - add r0, r0, #8 - subs r12, r12, #1 - bne h_count - - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |mb_lpf_horizontal_edge| - -; void aom_lpf_horizontal_edge_8_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|aom_lpf_horizontal_edge_8_neon| PROC - mov r12, #1 - b mb_lpf_horizontal_edge - ENDP ; |aom_lpf_horizontal_edge_8_neon| - -; void aom_lpf_horizontal_edge_16_neon(uint8_t *s, int pitch, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int pitch, -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh -|aom_lpf_horizontal_edge_16_neon| PROC - mov r12, #2 - b mb_lpf_horizontal_edge - ENDP ; |aom_lpf_horizontal_edge_16_neon| - -; void aom_lpf_vertical_16_neon(uint8_t *s, int p, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh) -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -|aom_lpf_vertical_16_neon| PROC - push {r4-r8, lr} - vpush {d8-d15} - ldr r4, [sp, #88] ; load thresh - - vld1.8 {d16[]}, [r2] ; load *blimit - vld1.8 {d17[]}, [r3] ; load *limit - vld1.8 {d18[]}, [r4] ; load *thresh - - sub r8, r0, #8 - - vld1.8 {d0}, [r8@64], r1 - vld1.8 {d8}, [r0@64], r1 - vld1.8 {d1}, [r8@64], r1 - vld1.8 {d9}, [r0@64], r1 - vld1.8 {d2}, [r8@64], r1 - vld1.8 {d10}, [r0@64], r1 - vld1.8 {d3}, [r8@64], r1 - vld1.8 {d11}, [r0@64], r1 - vld1.8 {d4}, [r8@64], r1 - vld1.8 {d12}, [r0@64], r1 - vld1.8 {d5}, [r8@64], r1 - vld1.8 {d13}, [r0@64], r1 - vld1.8 {d6}, [r8@64], r1 - vld1.8 {d14}, [r0@64], r1 - vld1.8 {d7}, [r8@64], r1 - vld1.8 {d15}, [r0@64], r1 - - sub r0, r0, r1, lsl #3 - - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - vtrn.8 d0, d1 - vtrn.8 d2, d3 - vtrn.8 d4, d5 - vtrn.8 d6, d7 - - vtrn.8 d8, d9 - vtrn.8 d10, d11 - vtrn.8 d12, d13 - vtrn.8 d14, d15 - - bl aom_wide_mbfilter_neon - - tst r7, #1 - beq v_mbfilter - - ; flat && mask were not set for any of the channels. Just store the values - ; from filter. - sub r8, r0, #2 - - vswp d23, d25 - - vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 - vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 - vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 - vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 - vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 - vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 - vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 - vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 - - b v_end - -v_mbfilter - tst r7, #2 - beq v_wide_mbfilter - - ; flat2 was not set for any of the channels. Just store the values from - ; mbfilter. - sub r8, r0, #3 - - vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 - vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 - vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 - vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 - vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 - vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 - vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 - vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 - vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 - vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 - vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 - vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 - vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 - vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 - vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 - vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 - - b v_end - -v_wide_mbfilter - sub r8, r0, #8 - - vtrn.32 d0, d26 - vtrn.32 d16, d27 - vtrn.32 d24, d18 - vtrn.32 d25, d19 - - vtrn.16 d0, d24 - vtrn.16 d16, d25 - vtrn.16 d26, d18 - vtrn.16 d27, d19 - - vtrn.8 d0, d16 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - vtrn.8 d18, d19 - - vtrn.32 d20, d1 - vtrn.32 d21, d2 - vtrn.32 d22, d3 - vtrn.32 d23, d15 - - vtrn.16 d20, d22 - vtrn.16 d21, d23 - vtrn.16 d1, d3 - vtrn.16 d2, d15 - - vtrn.8 d20, d21 - vtrn.8 d22, d23 - vtrn.8 d1, d2 - vtrn.8 d3, d15 - - vst1.8 {d0}, [r8@64], r1 - vst1.8 {d20}, [r0@64], r1 - vst1.8 {d16}, [r8@64], r1 - vst1.8 {d21}, [r0@64], r1 - vst1.8 {d24}, [r8@64], r1 - vst1.8 {d22}, [r0@64], r1 - vst1.8 {d25}, [r8@64], r1 - vst1.8 {d23}, [r0@64], r1 - vst1.8 {d26}, [r8@64], r1 - vst1.8 {d1}, [r0@64], r1 - vst1.8 {d27}, [r8@64], r1 - vst1.8 {d2}, [r0@64], r1 - vst1.8 {d18}, [r8@64], r1 - vst1.8 {d3}, [r0@64], r1 - vst1.8 {d19}, [r8@64], r1 - vst1.8 {d15}, [r0@64], r1 - -v_end - vpop {d8-d15} - pop {r4-r8, pc} - - ENDP ; |aom_lpf_vertical_16_neon| - -; void aom_wide_mbfilter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. -; -; r0-r3 PRESERVE -; d16 blimit -; d17 limit -; d18 thresh -; d0 p7 -; d1 p6 -; d2 p5 -; d3 p4 -; d4 p3 -; d5 p2 -; d6 p1 -; d7 p0 -; d8 q0 -; d9 q1 -; d10 q2 -; d11 q3 -; d12 q4 -; d13 q5 -; d14 q6 -; d15 q7 -|aom_wide_mbfilter_neon| PROC - mov r7, #0 - - ; filter_mask - vabd.u8 d19, d4, d5 ; abs(p3 - p2) - vabd.u8 d20, d5, d6 ; abs(p2 - p1) - vabd.u8 d21, d6, d7 ; abs(p1 - p0) - vabd.u8 d22, d9, d8 ; abs(q1 - q0) - vabd.u8 d23, d10, d9 ; abs(q2 - q1) - vabd.u8 d24, d11, d10 ; abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) - vmax.u8 d19, d19, d20 - - vabd.u8 d24, d7, d8 ; abs(p0 - q0) - - vmax.u8 d19, d19, d23 - - vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) - vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 - - ; abs () > limit - vcge.u8 d19, d17, d19 - - ; flatmask4 - vabd.u8 d25, d7, d5 ; abs(p0 - p2) - vabd.u8 d26, d8, d10 ; abs(q0 - q2) - vabd.u8 d27, d4, d7 ; abs(p3 - p0) - vabd.u8 d28, d11, d8 ; abs(q3 - q0) - - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 - - vshr.u8 d23, d23, #1 ; a = a / 2 - vqadd.u8 d24, d24, d23 ; a = b + a - - vmov.u8 d30, #1 - vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 - - vcge.u8 d20, d30, d20 ; flat - - vand d19, d19, d24 ; mask - - ; hevmask - vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 ; hev - - vand d16, d20, d19 ; flat && mask - vmov r5, r6, d16 - - ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) - vabd.u8 d22, d3, d7 ; abs(p4 - p0) - vabd.u8 d23, d12, d8 ; abs(q4 - q0) - vabd.u8 d24, d7, d2 ; abs(p0 - p5) - vabd.u8 d25, d8, d13 ; abs(q0 - q5) - vabd.u8 d26, d1, d7 ; abs(p6 - p0) - vabd.u8 d27, d14, d8 ; abs(q6 - q0) - vabd.u8 d28, d0, d7 ; abs(p7 - p0) - vabd.u8 d29, d15, d8 ; abs(q7 - q0) - - ; only compare the largest value to thresh - vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) - vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) - vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) - vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) - - vmax.u8 d26, d22, d23 - vmax.u8 d27, d24, d25 - vmax.u8 d23, d26, d27 - - vcge.u8 d18, d30, d23 ; flat2 - - vmov.u8 d22, #0x80 - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #1 ; Only do filter branch - - vand d17, d18, d16 ; flat2 && flat && mask - vmov r5, r6, d17 - - ; mbfilter() function - - ; filter() function - ; convert to signed - veor d23, d8, d22 ; qs0 - veor d24, d7, d22 ; ps0 - veor d25, d6, d22 ; ps1 - veor d26, d9, d22 ; qs1 - - vmov.u8 d27, #3 - - vsub.s8 d28, d23, d24 ; ( qs0 - ps0) - vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) - vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev - vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) - vmov.u8 d29, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d28, q15 - - vand d28, d28, d19 ; filter &= mask - - vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) - vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) - vshr.s8 d30, d30, #3 ; filter2 >>= 3 - vshr.s8 d29, d29, #3 ; filter1 >>= 3 - - - vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) - - ; outer tap adjustments: ++filter1 >> 1 - vrshr.s8 d29, d29, #1 - vbic d29, d29, d21 ; filter &= ~hev - - vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) - vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) - - veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d23, d23, d22 ; *f_oq0 = u^0x80 - veor d25, d25, d22 ; *f_op1 = u^0x80 - veor d26, d26, d22 ; *f_oq1 = u^0x80 - - tst r7, #1 - bxne lr - - orrs r5, r5, r6 ; Check for 0 - orreq r7, r7, #2 ; Only do mbfilter branch - - ; mbfilter flat && mask branch - ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's - ; and using vibt on the q's? - vmov.u8 d29, #2 - vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 - vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 - vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 - vaddl.u8 q10, d4, d5 - vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 - vaddl.u8 q14, d6, d9 - vqrshrn.u16 d18, q15, #3 ; r_op2 - - vsub.i16 q15, q10 - vaddl.u8 q10, d4, d6 - vadd.i16 q15, q14 - vaddl.u8 q14, d7, d10 - vqrshrn.u16 d19, q15, #3 ; r_op1 - - vsub.i16 q15, q10 - vadd.i16 q15, q14 - vaddl.u8 q14, d8, d11 - vqrshrn.u16 d20, q15, #3 ; r_op0 - - vsubw.u8 q15, d4 ; oq0 = op0 - p3 - vsubw.u8 q15, d7 ; oq0 -= p0 - vadd.i16 q15, q14 - vaddl.u8 q14, d9, d11 - vqrshrn.u16 d21, q15, #3 ; r_oq0 - - vsubw.u8 q15, d5 ; oq1 = oq0 - p2 - vsubw.u8 q15, d8 ; oq1 -= q0 - vadd.i16 q15, q14 - vaddl.u8 q14, d10, d11 - vqrshrn.u16 d22, q15, #3 ; r_oq1 - - vsubw.u8 q15, d6 ; oq2 = oq0 - p1 - vsubw.u8 q15, d9 ; oq2 -= q1 - vadd.i16 q15, q14 - vqrshrn.u16 d27, q15, #3 ; r_oq2 - - ; Filter does not set op2 or oq2, so use p2 and q2. - vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) - vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) - vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) - vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) - vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) - - vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) - vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) - - tst r7, #2 - bxne lr - - ; wide_mbfilter flat2 && flat && mask branch - vmov.u8 d16, #7 - vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 - vaddl.u8 q12, d2, d3 - vaddl.u8 q13, d4, d5 - vaddl.u8 q14, d1, d6 - vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 - vadd.i16 q12, q13 - vadd.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vadd.i16 q15, q12 - vaddl.u8 q12, d0, d1 - vaddw.u8 q15, d1 - vaddl.u8 q13, d0, d2 - vadd.i16 q14, q15, q14 - vqrshrn.u16 d16, q15, #4 ; w_op6 - - vsub.i16 q15, q14, q12 - vaddl.u8 q14, d3, d10 - vqrshrn.u16 d24, q15, #4 ; w_op5 - - vsub.i16 q15, q13 - vaddl.u8 q13, d0, d3 - vadd.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vqrshrn.u16 d25, q15, #4 ; w_op4 - - vadd.i16 q15, q14 - vaddl.u8 q14, d0, d4 - vsub.i16 q15, q13 - vsub.i16 q14, q15, q14 - vqrshrn.u16 d26, q15, #4 ; w_op3 - - vaddw.u8 q15, q14, d5 ; op2 += p2 - vaddl.u8 q14, d0, d5 - vaddw.u8 q15, d12 ; op2 += q4 - vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) - vqrshrn.u16 d27, q15, #4 ; w_op2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d6 - vaddw.u8 q15, d6 ; op1 += p1 - vaddw.u8 q15, d13 ; op1 += q5 - vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) - vqrshrn.u16 d18, q15, #4 ; w_op1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d0, d7 - vaddw.u8 q15, d7 ; op0 += p0 - vaddw.u8 q15, d14 ; op0 += q6 - vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) - vqrshrn.u16 d19, q15, #4 ; w_op0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d1, d8 - vaddw.u8 q15, d8 ; oq0 += q0 - vaddw.u8 q15, d15 ; oq0 += q7 - vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) - vqrshrn.u16 d20, q15, #4 ; w_oq0 - - vsub.i16 q15, q14 - vaddl.u8 q14, d2, d9 - vaddw.u8 q15, d9 ; oq1 += q1 - vaddl.u8 q4, d10, d15 - vaddw.u8 q15, d15 ; oq1 += q7 - vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) - vqrshrn.u16 d21, q15, #4 ; w_oq1 - - vsub.i16 q15, q14 - vaddl.u8 q14, d3, d10 - vadd.i16 q15, q4 - vaddl.u8 q4, d11, d15 - vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) - vqrshrn.u16 d22, q15, #4 ; w_oq2 - - vsub.i16 q15, q14 - vaddl.u8 q14, d4, d11 - vadd.i16 q15, q4 - vaddl.u8 q4, d12, d15 - vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) - vqrshrn.u16 d23, q15, #4 ; w_oq3 - - vsub.i16 q15, q14 - vaddl.u8 q14, d5, d12 - vadd.i16 q15, q4 - vaddl.u8 q4, d13, d15 - vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) - vqrshrn.u16 d1, q15, #4 ; w_oq4 - - vsub.i16 q15, q14 - vaddl.u8 q14, d6, d13 - vadd.i16 q15, q4 - vaddl.u8 q4, d14, d15 - vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) - vqrshrn.u16 d2, q15, #4 ; w_oq5 - - vsub.i16 q15, q14 - vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) - vadd.i16 q15, q4 - vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) - vqrshrn.u16 d3, q15, #4 ; w_oq6 - vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) - vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) - vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) - - bx lr - ENDP ; |aom_wide_mbfilter_neon| - - END diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c index c90d6bfde..ee1a3c78f 100644 --- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * Copyright (c) 2018, Alliance for Open Media. All rights reserved * * This source code is subject to the terms of the BSD 2 Clause License and * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License @@ -11,39 +11,690 @@ #include -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" +#include "av1/common/arm/mem_neon.h" +#include "av1/common/arm/transpose_neon.h" + +static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p3q3, p2q2); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1)); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, + uint8x8_t p1q1, uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0) { + const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 + uint8x8_t flat_8x8, temp_8x8; + + flat_8x8 = vabd_u8(p1q1, p0q0); + flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0)); + flat_8x8 = vcle_u8(flat_8x8, thresh_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8))); + flat_8x8 = vand_u8(flat_8x8, temp_8x8); + + return flat_8x8; +} + +static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1, + uint8x8_t p0q0, const uint8_t blimit, + const uint8_t limit) { + // Calculate mask3 values for four samples + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + uint8x8_t mask_8x8, temp_8x8; + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit); + + mask_8x8 = vabd_u8(p2q2, p1q1); + mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0)); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + +static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4, + uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4, + out_f14_pq5; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8, flat2_8x8; + uint8x8_t q0p0, q1p1, q2p2; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + // filter 14 + uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5; + uint16x8_t p6q6_2, p6q6_temp, qp_sum; + uint8x8_t qp_rev; + + out = vaddw_u8(out, *p4q4); + out = vaddw_u8(out, *p5q5); + out = vaddw_u8(out, *p6q6); + + out_pq5 = vaddw_u8(out, *p4q4); + out_pq4 = vaddw_u8(out_pq5, *p3q3); + out_pq3 = vaddw_u8(out_pq4, *p2q2); + + out_pq5 = vaddw_u8(out_pq5, *p5q5); + out_pq4 = vaddw_u8(out_pq4, *p5q5); + + out_pq0 = vaddw_u8(out, *p1q1); + out_pq1 = vaddw_u8(out_pq0, *p2q2); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + + out_pq0 = vaddw_u8(out_pq0, *p0q0); + out_pq1 = vaddw_u8(out_pq1, *p0q0); + + out_pq1 = vaddw_u8(out_pq1, *p6q6); + p6q6_2 = vaddl_u8(*p6q6, *p6q6); + out_pq2 = vaddq_u16(out_pq2, p6q6_2); + p6q6_temp = vaddw_u8(p6q6_2, *p6q6); + out_pq3 = vaddq_u16(out_pq3, p6q6_temp); + p6q6_temp = vaddw_u8(p6q6_temp, *p6q6); + out_pq4 = vaddq_u16(out_pq4, p6q6_temp); + p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2); + out_pq5 = vaddq_u16(out_pq5, p6q6_temp); + + out_pq4 = vaddw_u8(out_pq4, q1p1); + + qp_sum = vaddl_u8(q2p2, q1p1); + out_pq3 = vaddq_u16(out_pq3, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq2 = vaddq_u16(out_pq2, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq1 = vaddq_u16(out_pq1, qp_sum); + + qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5))); + qp_sum = vaddw_u8(qp_sum, qp_rev); + out_pq0 = vaddq_u16(out_pq0, qp_sum); + + out_pq0 = vaddw_u8(out_pq0, q0p0); + + out_f14_pq0 = vrshrn_n_u16(out_pq0, 4); + out_f14_pq1 = vrshrn_n_u16(out_pq1, 4); + out_f14_pq2 = vrshrn_n_u16(out_pq2, 4); + out_f14_pq3 = vrshrn_n_u16(out_pq3, 4); + out_f14_pq4 = vrshrn_n_u16(out_pq4, 4); + out_f14_pq5 = vrshrn_n_u16(out_pq5, 4); + } + { + uint8x8_t filter4_cond, filter8_cond, filter14_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + filter14_cond = vand_u8(filter8_cond, flat2_8x8); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + + // filter14 outputs + *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0); + *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1); + *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2); + *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3); + *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4); + *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5); + } +} + +static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1, + uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); -void aom_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + hev_8x8 = vmvn_s8(hev_8x8); + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 8 + uint16x8_t out_pq0, out_pq1, out_pq2; + uint8x8_t q0p0, q1p1, q2p2; + + out = vaddl_u8(*p3q3, *p2q2); + out = vaddw_u8(out, *p1q1); + out = vaddw_u8(out, *p0q0); + + // reverse p and q + q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2))); + + out = vaddw_u8(out, q0p0); + out_pq1 = vaddw_u8(out, *p3q3); + out_pq2 = vaddw_u8(out_pq1, *p3q3); + out_pq2 = vaddw_u8(out_pq2, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p1q1); + out_pq1 = vaddw_u8(out_pq1, q1p1); + + out_pq0 = vaddw_u8(out, *p0q0); + out_pq0 = vaddw_u8(out_pq0, q1p1); + out_pq0 = vaddw_u8(out_pq0, q2p2); + + out_f7_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f7_pq1 = vrshrn_n_u16(out_pq1, 3); + out_f7_pq2 = vrshrn_n_u16(out_pq2, 3); + } + { + uint8x8_t filter4_cond, filter8_cond; + filter8_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter8_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter8 outputs + *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0); + *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1); + *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2); + } +} + +static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, + const uint8_t blimit, const uint8_t limit, + const uint8_t thresh) { + uint16x8_t out; + uint8x8_t out_f6_pq0, out_f6_pq1; + uint8x8_t out_f4_pq0, out_f4_pq1; + uint8x8_t mask_8x8, flat_8x8; + + // Calculate filter masks + mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit); + flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0); + { + // filter 4 + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); + } + { + // filter 6 + uint16x8_t out_pq0, out_pq1; + uint8x8_t pq_rev; + + out = vaddl_u8(*p0q0, *p1q1); + out = vaddq_u16(out, out); + out = vaddw_u8(out, *p2q2); + + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0))); + out = vaddw_u8(out, pq_rev); + + out_pq0 = vaddw_u8(out, pq_rev); + pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1))); + out_pq0 = vaddw_u8(out_pq0, pq_rev); + + out_pq1 = vaddw_u8(out, *p2q2); + out_pq1 = vaddw_u8(out_pq1, *p2q2); + + out_f6_pq0 = vrshrn_n_u16(out_pq0, 3); + out_f6_pq1 = vrshrn_n_u16(out_pq1, 3); + } + { + uint8x8_t filter4_cond, filter6_cond; + filter6_cond = vand_u8(flat_8x8, mask_8x8); + filter4_cond = vmvn_u8(filter6_cond); + + // filter4 outputs + *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0); + *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1); + + // filter6 outputs + *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0); + *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1); + } +} + +void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x16_t row0, row1, row2, row3; + uint8x8_t pxp3, p6p2, p5p1, p4p0; + uint8x8_t q0q4, q1q5, q2q6, q3qy; + uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3; + uint32x2_t pq_rev; + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6; + + // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y + load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3); + + pxp3 = vget_low_u8(row0); + p6p2 = vget_low_u8(row1); + p5p1 = vget_low_u8(row2); + p4p0 = vget_low_u8(row3); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + q0q4 = vget_high_u8(row0); + q1q5 = vget_high_u8(row1); + q2q6 = vget_high_u8(row2); + q3qy = vget_high_u8(row3); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy)); + pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev); + + p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3)); + p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1)); + p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0)); + p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2)); + + pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]); + p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]); + p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]); + p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]); + + q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]); + q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]); + q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]); + q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]); + transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy); + + pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]); + p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]); + p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]); + p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]); + transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0); + + row0 = vcombine_u8(pxp3, q0q4); + row1 = vcombine_u8(p6p2, q1q5); + row2 = vcombine_u8(p5p1, q2q6); + row3 = vcombine_u8(p4p0, q3qy); + + store_u8_8x16(src - 8, stride, row0, row1, row2, row3); } -#if HAVE_NEON_ASM -void aom_lpf_horizontal_8_dual_neon( - uint8_t *s, int p /* pitch */, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, - const uint8_t *limit1, const uint8_t *thresh1) { - aom_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1); +void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, p3q3_p0q0; + uint32x2_t pq_rev; + uint8x8_t p3q0, p2q1, p1q2, p0q3; + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + // row0: p3 p2 p1 p0 | q0 q1 q2 q3 + // row1: p3 p2 p1 p0 | q0 q1 q2 q3 + // row2: p3 p2 p1 p0 | q0 q1 q2 q3 + // row3: p3 p2 p1 p0 | q0 q1 q2 q3 + load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3); + + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]); + transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3); + + store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); } -void aom_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - aom_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0); - aom_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1); +void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3; + + p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride))); + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride), + vreinterpret_u32_u8(p3q3), 1)); + + lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0); + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); + vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1); } -void aom_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - aom_lpf_vertical_16_neon(s, p, blimit, limit, thresh); - aom_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); +void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2; + + p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride))); + p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride))); + p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride), + vreinterpret_u32_u8(p0q0), 1)); + p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride), + vreinterpret_u32_u8(p1q1), 1)); + p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride), + vreinterpret_u32_u8(p2q2), 1)); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0); + vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0); + vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0); + vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1); + vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); + vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); } -#endif // HAVE_NEON_ASM diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c index a1eeaf4b7..606950ab2 100644 --- a/third_party/aom/aom_dsp/arm/sad4d_neon.c +++ b/third_party/aom/aom_dsp/arm/sad4d_neon.c @@ -11,8 +11,9 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c index 2f452f55b..a39de91d6 100644 --- a/third_party/aom/aom_dsp/arm/sad_neon.c +++ b/third_party/aom/aom_dsp/arm/sad_neon.c @@ -11,7 +11,7 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" #include "aom/aom_integer.h" diff --git a/third_party/aom/aom_dsp/arm/save_reg_neon.asm b/third_party/aom/aom_dsp/arm/save_reg_neon.asm deleted file mode 100644 index e04969823..000000000 --- a/third_party/aom/aom_dsp/arm/save_reg_neon.asm +++ /dev/null @@ -1,39 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - - EXPORT |aom_push_neon| - EXPORT |aom_pop_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|aom_push_neon| PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - -|aom_pop_neon| PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - - END - diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c index 064b72d6f..44d821821 100644 --- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c +++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c @@ -10,8 +10,9 @@ */ #include -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c index cb8a2daf8..28f5ace8e 100644 --- a/third_party/aom/aom_dsp/arm/subtract_neon.c +++ b/third_party/aom/aom_dsp/arm/subtract_neon.c @@ -11,7 +11,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" void aom_subtract_block_neon(int rows, int cols, int16_t *diff, diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c index dbab287e3..74385a601 100644 --- a/third_party/aom/aom_dsp/arm/variance_neon.c +++ b/third_party/aom/aom_dsp/arm/variance_neon.c @@ -11,8 +11,8 @@ #include -#include "./aom_dsp_rtcd.h" -#include "./aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c deleted file mode 100644 index f732224fd..000000000 --- a/third_party/aom/aom_dsp/avg.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_ports/mem.h" - -// src_diff: first pass, 9 bit, dynamic range [-255, 255] -// second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; - int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; - int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; - int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; - int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; - int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; - int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; - int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; - - int16_t c0 = b0 + b2; - int16_t c1 = b1 + b3; - int16_t c2 = b0 - b2; - int16_t c3 = b1 - b3; - int16_t c4 = b4 + b6; - int16_t c5 = b5 + b7; - int16_t c6 = b4 - b6; - int16_t c7 = b5 - b7; - - coeff[0] = c0 + c4; - coeff[7] = c1 + c5; - coeff[3] = c2 + c6; - coeff[4] = c3 + c7; - coeff[2] = c0 - c4; - coeff[6] = c1 - c5; - coeff[1] = c2 - c6; - coeff[5] = c3 - c7; -} - -// The order of the output coeff of the hadamard is not important. For -// optimization purposes the final transpose may be skipped. -void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int idx; - int16_t buffer[64]; - int16_t *tmp_buf = &buffer[0]; - for (idx = 0; idx < 8; ++idx) { - hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit - // dynamic range [-255, 255] - tmp_buf += 8; - ++src_diff; - } - - tmp_buf = &buffer[0]; - for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit - // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] - ++tmp_buf; - } -} - -// In place 16x16 2D Hadamard transform -void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int idx; - for (idx = 0; idx < 4; ++idx) { - // src_diff: 9 bit, dynamic range [-255, 255] - const int16_t *src_ptr = - src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; - aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); - } - - // coeff: 15 bit, dynamic range [-16320, 16320] - for (idx = 0; idx < 64; ++idx) { - int16_t a0 = coeff[0]; - int16_t a1 = coeff[64]; - int16_t a2 = coeff[128]; - int16_t a3 = coeff[192]; - - int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] - int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range - int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] - int16_t b3 = (a2 - a3) >> 1; - - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] - coeff[64] = b1 + b3; - coeff[128] = b0 - b2; - coeff[192] = b1 - b3; - - ++coeff; - } -} - -// coeff: 16 bits, dynamic range [-32640, 32640]. -// length: value range {16, 64, 256, 1024}. -int aom_satd_c(const int16_t *coeff, int length) { - int i; - int satd = 0; - for (i = 0; i < length; ++i) satd += abs(coeff[i]); - - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - return satd; -} - -// Integer projection onto row vectors. -// height: value range {16, 32, 64}. -void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, int ref_stride, - int height) { - int idx; - const int norm_factor = height >> 1; - for (idx = 0; idx < 16; ++idx) { - int i; - hbuf[idx] = 0; - // hbuf[idx]: 14 bit, dynamic range [0, 16320]. - for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; - // hbuf[idx]: 9 bit, dynamic range [0, 510]. - hbuf[idx] /= norm_factor; - ++ref; - } -} - -// width: value range {16, 32, 64}. -int16_t aom_int_pro_col_c(const uint8_t *ref, int width) { - int idx; - int16_t sum = 0; - // sum: 14 bit, dynamic range [0, 16320] - for (idx = 0; idx < width; ++idx) sum += ref[idx]; - return sum; -} - -// ref: [0 - 510] -// src: [0 - 510] -// bwl: {2, 3, 4} -int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) { - int i; - int width = 4 << bwl; - int sse = 0, mean = 0, var; - - for (i = 0; i < width; ++i) { - int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. - mean += diff; // mean: dynamic range 16 bits. - sse += diff * diff; // sse: dynamic range 26 bits. - } - - // (mean * mean): dynamic range 31 bits. - var = sse - ((mean * mean) >> (bwl + 2)); - return var; -} - -void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref, - int ref_stride, int *min, int *max) { - int i, j; - *min = 255; - *max = 0; - for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) { - for (j = 0; j < 8; ++j) { - int diff = abs(src[j] - ref[j]); - *min = diff < *min ? diff : *min; - *max = diff > *max ? diff : *max; - } - } -} - -#if CONFIG_HIGHBITDEPTH -void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, - int dp, int *min, int *max) { - int i, j; - const uint16_t *s = CONVERT_TO_SHORTPTR(s8); - const uint16_t *d = CONVERT_TO_SHORTPTR(d8); - *min = 255; - *max = 0; - for (i = 0; i < 8; ++i, s += p, d += dp) { - for (j = 0; j < 8; ++j) { - int diff = abs(s[j] - d[j]); - *min = diff < *min ? diff : *min; - *max = diff > *max ? diff : *max; - } - } -} -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c index 4f38afbc5..d05c3efdc 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.c +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -33,17 +33,6 @@ static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { } } -int16_t aom_read_primitive_symmetric_(aom_reader *r, - unsigned int mag_bits ACCT_STR_PARAM) { - if (aom_read_bit(r, ACCT_STR_NAME)) { - int s = aom_read_bit(r, ACCT_STR_NAME); - int16_t x = aom_read_literal(r, mag_bits, ACCT_STR_NAME) + 1; - return (s > 0 ? -x : x); - } else { - return 0; - } -} - uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM) { if (n <= 1) return 0; @@ -62,76 +51,56 @@ static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); } -uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p, - uint16_t ref ACCT_STR_PARAM) { - if (n <= 1) return 0; - assert(p > 0 && p <= n); - assert(ref < n); - int lolimit = ref - p / 2; - const int hilimit = lolimit + p - 1; - if (lolimit < 0) { - lolimit = 0; - } else if (hilimit >= n) { - lolimit = n - p; - } - int v; - if (aom_read_bit(r, ACCT_STR_NAME)) { - v = aom_read_primitive_quniform(r, p, ACCT_STR_NAME) + lolimit; - } else { - v = aom_read_primitive_quniform(r, n - p, ACCT_STR_NAME); - if (v >= lolimit) v += p; - } - return v; -} - // Decode finite subexponential code that for a symbol v in [0, n-1] with // parameter k uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, uint16_t k ACCT_STR_PARAM) { int i = 0; int mk = 0; - uint16_t v; + while (1) { int b = (i ? k + i - 1 : k); int a = (1 << b); + if (n <= mk + 3 * a) { - v = aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; - break; - } else { - if (aom_read_bit(r, ACCT_STR_NAME)) { - i = i + 1; - mk += a; - } else { - v = aom_read_literal(r, b, ACCT_STR_NAME) + mk; - break; - } + return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk; + } + + if (!aom_read_bit(r, ACCT_STR_NAME)) { + return aom_read_literal(r, b, ACCT_STR_NAME) + mk; } + + i = i + 1; + mk += a; } - return v; + + assert(0); + return 0; } static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k) { int i = 0; int mk = 0; - uint16_t v; + while (1) { int b = (i ? k + i - 1 : k); int a = (1 << b); + if (n <= mk + 3 * a) { - v = aom_rb_read_primitive_quniform(rb, n - mk) + mk; - break; - } else { - if (aom_rb_read_bit(rb)) { - i = i + 1; - mk += a; - } else { - v = aom_rb_read_literal(rb, b) + mk; - break; - } + return aom_rb_read_primitive_quniform(rb, n - mk) + mk; } + + if (!aom_rb_read_bit(rb)) { + return aom_rb_read_literal(rb, b) + mk; + } + + i = i + 1; + mk += a; } - return v; + + assert(0); + return 0; } uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, @@ -146,20 +115,19 @@ static uint16_t aom_rb_read_primitive_refsubexpfin( aom_rb_read_primitive_subexpfin(rb, n, k)); } -// Decode finite subexponential code that for a symbol v in [-(n-1), n-1] with -// parameter k based on a reference ref also in [-(n-1), n-1]. -int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, - uint16_t k, - int16_t ref ACCT_STR_PARAM) { - ref += n - 1; - const uint16_t scaled_n = (n << 1) - 1; - return aom_read_primitive_refsubexpfin(r, scaled_n, k, ref, ACCT_STR_NAME) - - n + 1; -} - int16_t aom_rb_read_signed_primitive_refsubexpfin( struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) { ref += n - 1; const uint16_t scaled_n = (n << 1) - 1; return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; } + +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { + int leading_zeros = 0; + while (!aom_rb_read_bit(rb)) ++leading_zeros; + // Maximum 32 bits. + if (leading_zeros >= 32) return UINT32_MAX; + const uint32_t base = (1u << leading_zeros) - 1; + const uint32_t value = aom_rb_read_literal(rb, leading_zeros); + return base + value; +} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h index 8885142c9..5253c6154 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.h +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -18,40 +18,30 @@ extern "C" { #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_dsp/bitreader.h" #include "aom_dsp/bitreader_buffer.h" -#define aom_read_primitive_symmetric(r, n, ACCT_STR_NAME) \ - aom_read_primitive_symmetric_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \ aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_primitive_refbilevel(r, n, p, ref, ACCT_STR_NAME) \ - aom_read_primitive_refbilevel_(r, n, p, ref ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \ aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME)) #define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME)) -#define aom_read_signed_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \ - aom_read_signed_primitive_refsubexpfin_(r, n, k, \ - ref ACCT_STR_ARG(ACCT_STR_NAME)) -int16_t aom_read_primitive_symmetric_(aom_reader *r, - unsigned int mag_bits ACCT_STR_PARAM); uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM); -uint16_t aom_read_primitive_refbilevel_(aom_reader *r, uint16_t n, uint16_t p, - uint16_t ref ACCT_STR_PARAM); uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n, uint16_t k ACCT_STR_PARAM); uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, uint16_t ref ACCT_STR_PARAM); -int16_t aom_read_signed_primitive_refsubexpfin_(aom_reader *r, uint16_t n, - uint16_t k, - int16_t ref ACCT_STR_PARAM); int16_t aom_rb_read_signed_primitive_refsubexpfin( struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); + +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); + #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c index e092b6278..8f74f0942 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.c +++ b/third_party/aom/aom_dsp/binary_codes_writer.c @@ -89,61 +89,6 @@ int aom_count_primitive_quniform(uint16_t n, uint16_t v) { return v < m ? l - 1 : l; } -// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1] -// The closest p values of v from ref are coded using a p-ary quasi-unoform -// short code while the remaining n-p values are coded with a longer code. -void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p, - uint16_t ref, uint16_t v) { - if (n <= 1) return; - assert(p > 0 && p <= n); - assert(ref < n); - int lolimit = ref - p / 2; - int hilimit = lolimit + p - 1; - if (lolimit < 0) { - lolimit = 0; - hilimit = p - 1; - } else if (hilimit >= n) { - hilimit = n - 1; - lolimit = n - p; - } - if (v >= lolimit && v <= hilimit) { - aom_write_bit(w, 1); - v = v - lolimit; - aom_write_primitive_quniform(w, p, v); - } else { - aom_write_bit(w, 0); - if (v > hilimit) v -= p; - aom_write_primitive_quniform(w, n - p, v); - } -} - -int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref, - uint16_t v) { - if (n <= 1) return 0; - assert(p > 0 && p <= n); - assert(ref < n); - int lolimit = ref - p / 2; - int hilimit = lolimit + p - 1; - if (lolimit < 0) { - lolimit = 0; - hilimit = p - 1; - } else if (hilimit >= n) { - hilimit = n - 1; - lolimit = n - p; - } - int count = 0; - if (v >= lolimit && v <= hilimit) { - count++; - v = v - lolimit; - count += aom_count_primitive_quniform(p, v); - } else { - count++; - if (v > hilimit) v -= p; - count += aom_count_primitive_quniform(n - p, v); - } - return count; -} - // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, uint16_t v) { @@ -263,3 +208,15 @@ int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, const uint16_t scaled_n = (n << 1) - 1; return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v); } + +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { + int64_t shift_val = ++v; + int leading_zeroes = 1; + + assert(shift_val > 0); + + while (shift_val >>= 1) leading_zeroes += 2; + + aom_wb_write_literal(wb, 0, leading_zeroes >> 1); + aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); +} diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h index 18ad5078f..784c721a6 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.h +++ b/third_party/aom/aom_dsp/binary_codes_writer.h @@ -17,7 +17,8 @@ extern "C" { #endif #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_dsp/bitwriter.h" #include "aom_dsp/bitwriter_buffer.h" @@ -33,12 +34,6 @@ void aom_write_primitive_symmetric(aom_writer *w, int16_t v, // Encodes a value v in [0, n-1] quasi-uniformly void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v); -// Encodes a value v in [0, n-1] based on a reference ref also in [0, n-1] -// The closest p values of v from ref are coded using a p-ary quasi-unoform -// short code while the remaining n-p values are coded with a longer code. -void aom_write_primitive_refbilevel(aom_writer *w, uint16_t n, uint16_t p, - uint16_t ref, uint16_t v); - // Finite subexponential code that codes a symbol v in [0, n-1] with parameter k void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k, uint16_t v); @@ -61,13 +56,12 @@ void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb, // Functions that counts bits for the above primitives int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits); int aom_count_primitive_quniform(uint16_t n, uint16_t v); -int aom_count_primitive_refbilevel(uint16_t n, uint16_t p, uint16_t ref, - uint16_t v); int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v); int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v); int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, int16_t v); +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h index 00424fa76..328935be9 100644 --- a/third_party/aom/aom_dsp/bitreader.h +++ b/third_party/aom/aom_dsp/bitreader.h @@ -15,15 +15,11 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" #include "aom/aomdx.h" #include "aom/aom_integer.h" -#if CONFIG_ANS -#include "aom_dsp/ansreader.h" -#else #include "aom_dsp/daalaboolreader.h" -#endif #include "aom_dsp/prob.h" #include "av1/common/odintrin.h" @@ -50,72 +46,37 @@ #define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) -#if CONFIG_LV_MAP -#define aom_read_bin(r, cdf, nsymbs, ACCT_STR_NAME) \ - aom_read_bin_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME)) -#endif - #ifdef __cplusplus extern "C" { #endif -#if CONFIG_ANS -typedef struct AnsDecoder aom_reader; -#else typedef struct daala_reader aom_reader; -#endif static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer, - size_t size, aom_decrypt_cb decrypt_cb, - void *decrypt_state) { - (void)decrypt_cb; - (void)decrypt_state; -#if CONFIG_ANS - if (size > INT_MAX) return 1; - return ans_read_init(r, buffer, (int)size); -#else + size_t size) { return aom_daala_reader_init(r, buffer, (int)size); -#endif +} + +static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) { + return aom_daala_reader_find_begin(r); } static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) { -#if CONFIG_ANS - (void)r; - assert(0 && "Use the raw buffer size with ANS"); - return NULL; -#else return aom_daala_reader_find_end(r); -#endif } static INLINE int aom_reader_has_error(aom_reader *r) { -#if CONFIG_ANS - return ans_reader_has_error(r); -#else return aom_daala_reader_has_error(r); -#endif } // Returns the position in the bit reader in bits. static INLINE uint32_t aom_reader_tell(const aom_reader *r) { -#if CONFIG_ANS - (void)r; - assert(0 && "aom_reader_tell() is unimplemented for ANS"); - return 0; -#else return aom_daala_reader_tell(r); -#endif } // Returns the position in the bit reader in 1/8th bits. static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) { -#if CONFIG_ANS - (void)r; - assert(0 && "aom_reader_tell_frac() is unimplemented for ANS"); - return 0; -#else return aom_daala_reader_tell_frac(r); -#endif } #if CONFIG_ACCOUNTING @@ -139,11 +100,7 @@ static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) { static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { int ret; -#if CONFIG_ANS - ret = rabs_read(r, prob); -#else ret = aom_daala_read(r, prob); -#endif #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); aom_update_symb_counts(r, 1); @@ -153,15 +110,7 @@ static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) { static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) { int ret; -#if CONFIG_ANS - ret = rabs_read_bit(r); // Non trivial optimization at half probability -#elif CONFIG_RAWBITS - // Note this uses raw bits and is not the same as aom_daala_read(r, 128); - // Calls to this function are omitted from raw symbol accounting. - ret = aom_daala_read_bit(r); -#else ret = aom_read(r, 128, NULL); // aom_prob_half -#endif #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); #endif @@ -181,12 +130,7 @@ static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) { static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int ret; -#if CONFIG_ANS - (void)nsymbs; - ret = rans_read(r, cdf); -#else ret = daala_read_symbol(r, cdf, nsymbs); -#endif #if CONFIG_ACCOUNTING if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); @@ -199,46 +143,7 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, int nsymbs ACCT_STR_PARAM) { int ret; ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); - update_cdf(cdf, ret, nsymbs); - return ret; -} - -#if CONFIG_LV_MAP -static INLINE int aom_read_bin_(aom_reader *r, aom_cdf_prob *cdf, - int nsymbs ACCT_STR_PARAM) { - int ret; - ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME); - update_cdf(cdf, ret, nsymbs); - return ret; -} -#endif - -static INLINE int aom_read_tree_as_cdf(aom_reader *r, - const aom_tree_index *tree, - const aom_prob *probs) { - aom_tree_index i = 0; - do { - aom_cdf_prob cdf[16]; - aom_tree_index index[16]; - int path[16]; - int dist[16]; - int nsymbs; - int symb; - nsymbs = tree_to_cdf(tree, probs, i, cdf, index, path, dist); - symb = aom_read_cdf(r, cdf, nsymbs, NULL); - OD_ASSERT(symb >= 0 && symb < nsymbs); - i = index[symb]; - } while (i > 0); - return -i; -} - -static INLINE int aom_read_tree_(aom_reader *r, const aom_tree_index *tree, - const aom_prob *probs ACCT_STR_PARAM) { - int ret; - ret = aom_read_tree_as_cdf(r, tree, probs); -#if CONFIG_ACCOUNTING - if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME); -#endif + if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs); return ret; } diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c index e51b1cc3a..68fc381f2 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.c +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -8,8 +8,9 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_config.h" -#include "./bitreader_buffer.h" +#include "config/aom_config.h" + +#include "aom_dsp/bitreader_buffer.h" size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) { return (rb->bit_offset + 7) >> 3; @@ -35,9 +36,13 @@ int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { return value; } -int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits) { - const int value = aom_rb_read_literal(rb, bits); - return aom_rb_read_bit(rb) ? -value : value; +uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, + int bits) { + uint32_t value = 0; + int bit; + for (bit = bits - 1; bit >= 0; bit--) + value |= (uint32_t)aom_rb_read_bit(rb) << bit; + return value; } int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h index 22187357e..2dafe11ad 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.h +++ b/third_party/aom/aom_dsp/bitreader_buffer.h @@ -37,7 +37,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb); int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits); -int aom_rb_read_signed_literal(struct aom_read_bit_buffer *rb, int bits); +uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits); int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits); diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h index 7d3b34306..de1b1d048 100644 --- a/third_party/aom/aom_dsp/bitwriter.h +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -13,13 +13,10 @@ #define AOM_DSP_BITWRITER_H_ #include -#include "./aom_config.h" -#if CONFIG_ANS -#include "aom_dsp/buf_ans.h" -#else +#include "config/aom_config.h" + #include "aom_dsp/daalaboolwriter.h" -#endif #include "aom_dsp/prob.h" #if CONFIG_RD_DEBUG @@ -31,23 +28,16 @@ extern "C" { #endif -#if CONFIG_ANS -typedef struct BufAnsCoder aom_writer; -#else typedef struct daala_writer aom_writer; -#endif typedef struct TOKEN_STATS { int cost; -#if CONFIG_VAR_TX #if CONFIG_RD_DEBUG int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE]; #endif -#endif } TOKEN_STATS; static INLINE void init_token_stats(TOKEN_STATS *token_stats) { -#if CONFIG_VAR_TX #if CONFIG_RD_DEBUG int r, c; for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) { @@ -55,66 +45,24 @@ static INLINE void init_token_stats(TOKEN_STATS *token_stats) { token_stats->txb_coeff_cost_map[r][c] = 0; } } -#endif #endif token_stats->cost = 0; } static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) { -#if CONFIG_ANS - aom_buf_ans_alloc(bc, /* error context*/ NULL); - buf_ans_write_init(bc, buffer); -#else aom_daala_start_encode(bc, buffer); -#endif } -static INLINE void aom_stop_encode(aom_writer *bc) { -#if CONFIG_ANS - aom_buf_ans_flush(bc); - bc->pos = buf_ans_write_end(bc); -#else - aom_daala_stop_encode(bc); -#endif +static INLINE int aom_stop_encode(aom_writer *bc) { + return aom_daala_stop_encode(bc); } static INLINE void aom_write(aom_writer *br, int bit, int probability) { -#if CONFIG_ANS - buf_rabs_write(br, bit, probability); -#else aom_daala_write(br, bit, probability); -#endif -} - -static INLINE void aom_write_record(aom_writer *br, int bit, int probability, - TOKEN_STATS *token_stats) { - aom_write(br, bit, probability); -#if CONFIG_RD_DEBUG - token_stats->cost += av1_cost_bit(probability, bit); -#else - (void)token_stats; -#endif } static INLINE void aom_write_bit(aom_writer *w, int bit) { -#if CONFIG_ANS - buf_rabs_write_bit(w, bit); -#elif CONFIG_RAWBITS - // Note this uses raw bits and is not the same as aom_daala_write(r, 128); - aom_daala_write_bit(w, bit); -#else aom_write(w, bit, 128); // aom_prob_half -#endif -} - -static INLINE void aom_write_bit_record(aom_writer *w, int bit, - TOKEN_STATS *token_stats) { - aom_write_bit(w, bit); -#if CONFIG_RD_DEBUG - token_stats->cost += av1_cost_bit(128, bit); // aom_prob_half -#else - (void)token_stats; -#endif } static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { @@ -125,83 +73,13 @@ static INLINE void aom_write_literal(aom_writer *w, int data, int bits) { static INLINE void aom_write_cdf(aom_writer *w, int symb, const aom_cdf_prob *cdf, int nsymbs) { -#if CONFIG_ANS - (void)nsymbs; - assert(cdf); - const aom_cdf_prob cum_prob = symb > 0 ? cdf[symb - 1] : 0; - const aom_cdf_prob prob = cdf[symb] - cum_prob; - buf_rans_write(w, cum_prob, prob); -#else daala_write_symbol(w, symb, cdf, nsymbs); -#endif } static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, int nsymbs) { aom_write_cdf(w, symb, cdf, nsymbs); - update_cdf(cdf, symb, nsymbs); -} - -#if CONFIG_LV_MAP -static INLINE void aom_write_bin(aom_writer *w, int symb, aom_cdf_prob *cdf, - int nsymbs) { - aom_write_cdf(w, symb, cdf, nsymbs); - update_cdf(cdf, symb, nsymbs); -} -#endif - -static INLINE void aom_write_tree_as_cdf(aom_writer *w, - const aom_tree_index *tree, - const aom_prob *probs, int bits, - int len, aom_tree_index i) { - aom_tree_index root; - root = i; - do { - aom_cdf_prob cdf[16]; - aom_tree_index index[16]; - int path[16]; - int dist[16]; - int nsymbs; - int symb; - int j; - /* Compute the CDF of the binary tree using the given probabilities. */ - nsymbs = tree_to_cdf(tree, probs, root, cdf, index, path, dist); - /* Find the symbol to code. */ - symb = -1; - for (j = 0; j < nsymbs; j++) { - /* If this symbol codes a leaf node, */ - if (index[j] <= 0) { - if (len == dist[j] && path[j] == bits) { - symb = j; - break; - } - } else { - if (len > dist[j] && path[j] == bits >> (len - dist[j])) { - symb = j; - break; - } - } - } - OD_ASSERT(symb != -1); - aom_write_cdf(w, symb, cdf, nsymbs); - bits &= (1 << (len - dist[symb])) - 1; - len -= dist[symb]; - } while (len); -} - -static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree, - const aom_prob *probs, int bits, int len, - aom_tree_index i) { - aom_write_tree_as_cdf(w, tree, probs, bits, len, i); -} - -static INLINE void aom_write_tree_record(aom_writer *w, - const aom_tree_index *tree, - const aom_prob *probs, int bits, - int len, aom_tree_index i, - TOKEN_STATS *token_stats) { - (void)token_stats; - aom_write_tree_as_cdf(w, tree, probs, bits, len, i); + if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs); } #ifdef __cplusplus diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c index 1b3dd2913..21314eb2a 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.c +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -12,8 +12,13 @@ #include #include -#include "./aom_config.h" -#include "./bitwriter_buffer.h" +#include "config/aom_config.h" + +#include "aom_dsp/bitwriter_buffer.h" + +int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) { + return (wb->bit_offset % CHAR_BIT == 0); +} uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) { return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0); @@ -48,6 +53,12 @@ void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) { for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); } +void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, + uint32_t data, int bits) { + int bit; + for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); +} + void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, int bits) { int bit; diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h index 1f23dc857..f7f75a097 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.h +++ b/third_party/aom/aom_dsp/bitwriter_buffer.h @@ -23,6 +23,8 @@ struct aom_write_bit_buffer { uint32_t bit_offset; }; +int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb); + uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb); void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit); @@ -31,6 +33,9 @@ void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit); void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits); +void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, + uint32_t data, int bits); + void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, int bits); diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h index e5297ff83..434bb83a1 100644 --- a/third_party/aom/aom_dsp/blend.h +++ b/third_party/aom/aom_dsp/blend.h @@ -39,4 +39,7 @@ // Blending by averaging. #define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1) +#define DIFF_FACTOR_LOG2 4 +#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) + #endif // AOM_DSP_BLEND_H_ diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c index 99b4b8a59..0554b43d1 100644 --- a/third_party/aom/aom_dsp/blend_a64_hmask.c +++ b/third_party/aom/aom_dsp/blend_a64_hmask.c @@ -16,12 +16,12 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -40,11 +40,10 @@ void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride, } } -#if CONFIG_HIGHBITDEPTH void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { + const uint8_t *mask, int w, int h, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -68,4 +67,3 @@ void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride, } } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c index c35fa19f8..992cc5c0c 100644 --- a/third_party/aom/aom_dsp/blend_a64_mask.c +++ b/third_party/aom/aom_dsp/blend_a64_mask.c @@ -16,70 +16,209 @@ #include "aom_dsp/blend.h" #include "aom_dsp/aom_dsp_common.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" -#if CONFIG_CONVOLVE_ROUND // Blending with alpha mask. Mask values come from the range [0, 64], // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can // be the same as dst, or dst can be different from both sources. -void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride, - const int32_t *src0, uint32_t src0_stride, - const int32_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - int w, int subh, int subw) { +// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are +// in a higher intermediate precision, and will later be rounded down to pixel +// precision. +// Thus, in order to avoid double-rounding, we want to use normal right shifts +// within this function, not ROUND_POWER_OF_TWO. +// This works because of the identity: +// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z) +// +// In contrast, the output of the non-d32 functions will not be further rounded, +// so we *should* use ROUND_POWER_OF_TWO there. + +void aom_lowbd_blend_a64_d16_mask_c( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { int i, j; + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); - assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); - assert(h >= 1); - assert(w >= 1); + assert(h >= 4); + assert(w >= 4); assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); if (subw == 0 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { + int32_t res; const int m = mask[i * mask_stride + j]; - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); } } } else if (subw == 1 && subh == 1) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { + int32_t res; const int m = ROUND_POWER_OF_TWO( mask[(2 * i) * mask_stride + (2 * j)] + mask[(2 * i + 1) * mask_stride + (2 * j)] + mask[(2 * i) * mask_stride + (2 * j + 1)] + mask[(2 * i + 1) * mask_stride + (2 * j + 1)], 2); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); } } } else if (subw == 1 && subh == 0) { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { + int32_t res; const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)], mask[i * mask_stride + (2 * j + 1)]); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); + res = ((m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); } } } else { for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { + int32_t res; const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j], mask[(2 * i + 1) * mask_stride + j]); - dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j], - src1[i * src1_stride + j]); + res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] + + (AOM_BLEND_A64_MAX_ALPHA - m) * + (int32_t)src1[i * src1_stride + j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + dst[i * dst_stride + j] = + clip_pixel(ROUND_POWER_OF_TWO(res, round_bits)); + } + } + } +} + +void aom_highbd_blend_a64_d16_mask_c( + uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); + + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + // excerpt from clip_pixel_highbd() + // set saturation_value to (1 << bd) - 1 + unsigned int saturation_value; + switch (bd) { + case 8: + default: saturation_value = 255; break; + case 10: saturation_value = 1023; break; + case 12: saturation_value = 4095; break; + } + + if (subw == 0 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = mask[j]; + res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS); + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 1) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = ROUND_POWER_OF_TWO( + mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] + + mask[mask_stride + 2 * j + 1], + 2); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else if (subw == 1 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); + } + mask += mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int32_t res; + const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]); + res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >> + AOM_BLEND_A64_ROUND_BITS; + res -= round_offset; + unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits)); + dst[j] = AOMMIN(v, saturation_value); } + mask += 2 * mask_stride; + src0 += src0_stride; + src1 += src1_stride; + dst += dst_stride; } } } -#endif // CONFIG_CONVOLVE_ROUND // Blending with alpha mask. Mask values come from the range [0, 64], // as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can @@ -88,8 +227,8 @@ void aom_blend_a64_d32_mask_c(int32_t *dst, uint32_t dst_stride, void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - int w, int subh, int subw) { + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subw, int subh) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -142,12 +281,11 @@ void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride, } } -#if CONFIG_HIGHBITDEPTH void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, - int h, int w, int subh, int subw, int bd) { + int w, int h, int subw, int subh, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -205,4 +343,3 @@ void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride, } } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c index 1a5e30e31..4f222e17f 100644 --- a/third_party/aom/aom_dsp/blend_a64_vmask.c +++ b/third_party/aom/aom_dsp/blend_a64_vmask.c @@ -16,12 +16,12 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/blend.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { int i, j; assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); @@ -41,11 +41,10 @@ void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride, } } -#if CONFIG_HIGHBITDEPTH void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { + const uint8_t *mask, int w, int h, int bd) { int i, j; uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -70,4 +69,3 @@ void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride, } } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h index f84ff3aed..cf7df1dbf 100644 --- a/third_party/aom/aom_dsp/buf_ans.h +++ b/third_party/aom/aom_dsp/buf_ans.h @@ -16,7 +16,8 @@ // backwards due to ANS's stack like behavior. #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_dsp/ans.h" #include "aom_dsp/answriter.h" @@ -47,6 +48,7 @@ struct BufAnsCoder { int window_size; #endif int pos; // Dummy variable to store the output buffer after closing + uint8_t allow_update_cdf; }; // Allocate a buffered ANS coder to store size symbols. diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c index c6e3ac82d..4e224904e 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.c +++ b/third_party/aom/aom_dsp/daalaboolreader.c @@ -24,6 +24,10 @@ int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) { return 0; } +const uint8_t *aom_daala_reader_find_begin(daala_reader *r) { + return r->buffer; +} + const uint8_t *aom_daala_reader_find_end(daala_reader *r) { return r->buffer_end; } diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h index 55ff8d3d5..60c197a49 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.h +++ b/third_party/aom/aom_dsp/daalaboolreader.h @@ -34,11 +34,13 @@ struct daala_reader { #if CONFIG_ACCOUNTING Accounting *accounting; #endif + uint8_t allow_update_cdf; }; typedef struct daala_reader daala_reader; int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size); +const uint8_t *aom_daala_reader_find_begin(daala_reader *r); const uint8_t *aom_daala_reader_find_end(daala_reader *r); uint32_t aom_daala_reader_tell(const daala_reader *r); uint32_t aom_daala_reader_tell_frac(const daala_reader *r); @@ -96,12 +98,6 @@ static INLINE int aom_daala_read(daala_reader *r, int prob) { return bit; } -#if CONFIG_RAWBITS -static INLINE int aom_daala_read_bit(daala_reader *r) { - return od_ec_dec_bits(&r->ec, 1, "aom_bits"); -} -#endif - static INLINE int aom_daala_reader_has_error(daala_reader *r) { return r->ec.error; } diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c index 59af2a243..b24ffbf3f 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.c +++ b/third_party/aom/aom_dsp/daalaboolwriter.c @@ -18,11 +18,14 @@ void aom_daala_start_encode(daala_writer *br, uint8_t *source) { od_ec_enc_init(&br->ec, 62025); } -void aom_daala_stop_encode(daala_writer *br) { +int aom_daala_stop_encode(daala_writer *br) { + int nb_bits; uint32_t daala_bytes; unsigned char *daala_data; daala_data = od_ec_enc_done(&br->ec, &daala_bytes); + nb_bits = od_ec_enc_tell(&br->ec); memcpy(br->buffer, daala_data, daala_bytes); br->pos = daala_bytes; od_ec_enc_clear(&br->ec); + return nb_bits; } diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h index 6ec0f0b54..f9c596c73 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.h +++ b/third_party/aom/aom_dsp/daalaboolwriter.h @@ -28,12 +28,13 @@ struct daala_writer { unsigned int pos; uint8_t *buffer; od_ec_enc ec; + uint8_t allow_update_cdf; }; typedef struct daala_writer daala_writer; void aom_daala_start_encode(daala_writer *w, uint8_t *buffer); -void aom_daala_stop_encode(daala_writer *w); +int aom_daala_stop_encode(daala_writer *w); static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) { int p = (0x7FFFFF - (prob << 15) + prob) >> 8; @@ -53,12 +54,6 @@ static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) { od_ec_encode_bool_q15(&w->ec, bit, p); } -#if CONFIG_RAWBITS -static INLINE void aom_daala_write_bit(daala_writer *w, int bit) { - od_ec_enc_bits(&w->ec, bit, 1); -} -#endif - static INLINE void daala_write_symbol(daala_writer *w, int symb, const aom_cdf_prob *cdf, int nsymbs) { #if CONFIG_BITSTREAM_DEBUG diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c index ad76b7e3e..aad96c6fc 100644 --- a/third_party/aom/aom_dsp/entcode.c +++ b/third_party/aom/aom_dsp/entcode.c @@ -9,10 +9,6 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifdef HAVE_CONFIG_H -#include "./config.h" -#endif - #include "aom_dsp/entcode.h" /*Given the current total integer number of bits used and the current value of diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h index 981a951e6..5c15526e9 100644 --- a/third_party/aom/aom_dsp/entcode.h +++ b/third_party/aom/aom_dsp/entcode.h @@ -9,11 +9,16 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#if !defined(_entcode_H) -#define _entcode_H (1) +#ifndef AOM_DSP_ENTCODE_H_ +#define AOM_DSP_ENTCODE_H_ + #include #include #include "av1/common/odintrin.h" +#include "aom_dsp/prob.h" + +#define EC_PROB_SHIFT 6 +#define EC_MIN_PROB 4 // must be <= (1< 1/8th bits.*/ #define OD_BITRES (3) -/*The value stored in an iCDF is 32768 minus the actual Q15 cumulative - probability (an "inverse" CDF). - This function converts from one representation to the other (and is its own - inverse).*/ -#define OD_ICDF(x) (32768U - (x)) +#define OD_ICDF AOM_ICDF /*See entcode.c for further documentation.*/ OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng); -#endif +#endif // AOM_DSP_ENTCODE_H_ diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c index 71dad0df6..b8e9078c3 100644 --- a/third_party/aom/aom_dsp/entdec.c +++ b/third_party/aom/aom_dsp/entdec.c @@ -9,11 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifdef HAVE_CONFIG_H -#include "./config.h" -#endif - +#include #include "aom_dsp/entdec.h" +#include "aom_dsp/prob.h" /*A range decoder. This is an entropy decoder based upon \cite{Mar79}, which is itself a @@ -75,6 +73,8 @@ Even relatively modest values like 100 would work fine.*/ #define OD_EC_LOTS_OF_BITS (0x4000) +/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill + call.*/ static void od_ec_dec_refill(od_ec_dec *dec) { int s; od_ec_window dif; @@ -87,7 +87,7 @@ static void od_ec_dec_refill(od_ec_dec *dec) { end = dec->end; s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15); for (; s >= 0 && bptr < end; s -= 8, bptr++) { - OD_ASSERT(s <= OD_EC_WINDOW_SIZE - 8); + assert(s <= OD_EC_WINDOW_SIZE - 8); dif ^= (od_ec_window)bptr[0] << s; cnt += 8; } @@ -111,7 +111,7 @@ static void od_ec_dec_refill(od_ec_dec *dec) { static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, int ret) { int d; - OD_ASSERT(rng <= 65535U); + assert(rng <= 65535U); d = 16 - OD_ILOG_NZ(rng); dec->cnt -= d; /*This is equivalent to shifting in 1's instead of 0's.*/ @@ -127,9 +127,6 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage) { dec->buf = buf; - dec->eptr = buf + storage; - dec->end_window = 0; - dec->nend_bits = 0; dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8); dec->end = buf + storage; dec->bptr = buf; @@ -150,13 +147,14 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { unsigned r_new; unsigned v; int ret; - OD_ASSERT(0 < f); - OD_ASSERT(f < 32768U); + assert(0 < f); + assert(f < 32768U); dif = dec->dif; r = dec->rng; - OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); - OD_ASSERT(32768U <= r); - v = (r >> 8) * (uint32_t)f >> 7; + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); ret = 1; r_new = v; @@ -170,8 +168,8 @@ int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) { /*Decodes a symbol given an inverse cumulative distribution function (CDF) table in Q15. - icdf: 32768 minus the CDF, such that symbol s falls in the range - [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]). + icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range + [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]). The values must be monotonically non-increasing, and icdf[nsyms - 1] must be 0. nsyms: The number of symbols in the alphabet. @@ -187,62 +185,28 @@ int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) { (void)nsyms; dif = dec->dif; r = dec->rng; - OD_ASSERT(dif >> (OD_EC_WINDOW_SIZE - 16) < r); - OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U)); - OD_ASSERT(32768U <= r); + const int N = nsyms - 1; + + assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + assert(32768U <= r); + assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16)); v = r; ret = -1; do { u = v; - v = (r >> 8) * (uint32_t)icdf[++ret] >> 7; + v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)); + v += EC_MIN_PROB * (N - ret); } while (c < v); - OD_ASSERT(v < u); - OD_ASSERT(u <= r); + assert(v < u); + assert(u <= r); r = u - v; dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16); return od_ec_dec_normalize(dec, dif, r, ret); } -#if CONFIG_RAWBITS -/*Extracts a sequence of raw bits from the stream. - The bits must have been encoded with od_ec_enc_bits(). - ftb: The number of bits to extract. - This must be between 0 and 25, inclusive. - Return: The decoded bits.*/ -uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) { - od_ec_window window; - int available; - uint32_t ret; - OD_ASSERT(ftb <= 25); - window = dec->end_window; - available = dec->nend_bits; - if ((unsigned)available < ftb) { - const unsigned char *buf; - const unsigned char *eptr; - buf = dec->buf; - eptr = dec->eptr; - OD_ASSERT(available <= OD_EC_WINDOW_SIZE - 8); - do { - if (eptr <= buf) { - dec->tell_offs += OD_EC_LOTS_OF_BITS - available; - available = OD_EC_LOTS_OF_BITS; - break; - } - window |= (od_ec_window) * --eptr << available; - available += 8; - } while (available <= OD_EC_WINDOW_SIZE - 8); - dec->eptr = eptr; - } - ret = (uint32_t)window & (((uint32_t)1 << ftb) - 1); - window >>= ftb; - available -= ftb; - dec->end_window = window; - dec->nend_bits = available; - return ret; -} -#endif - /*Returns the number of bits "used" by the decoded symbols so far. This same number can be computed in either the encoder or the decoder, and is suitable for making coding decisions. @@ -250,8 +214,7 @@ uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb) { This will always be slightly larger than the exact value (e.g., all rounding error is in the positive direction).*/ int od_ec_dec_tell(const od_ec_dec *dec) { - return (int)(((dec->end - dec->eptr) + (dec->bptr - dec->buf)) * 8 - - dec->cnt - dec->nend_bits + dec->tell_offs); + return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs); } /*Returns the number of bits "used" by the decoded symbols so far. diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h index 35ac7fe0d..e35c3f99f 100644 --- a/third_party/aom/aom_dsp/entdec.h +++ b/third_party/aom/aom_dsp/entdec.h @@ -32,16 +32,10 @@ typedef struct od_ec_dec od_ec_dec; struct od_ec_dec { /*The start of the current input buffer.*/ const unsigned char *buf; - /*The read pointer for the raw bits.*/ - const unsigned char *eptr; - /*Bits that will be read from/written at the end.*/ - od_ec_window end_window; - /*Number of valid bits in end_window.*/ - int nend_bits; /*An offset used to keep track of tell after reaching the end of the stream. This is constant throughout most of the decoding process, but becomes important once we hit the end of the buffer and stop incrementing pointers - (and instead pretend cnt/nend_bits have lots of bits).*/ + (and instead pretend cnt has lots of bits).*/ int32_t tell_offs; /*The end of the current input buffer.*/ const unsigned char *end; diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c index b8c4dc047..6866de9b9 100644 --- a/third_party/aom/aom_dsp/entenc.c +++ b/third_party/aom/aom_dsp/entenc.c @@ -9,13 +9,19 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifdef HAVE_CONFIG_H -#include "./config.h" -#endif - #include #include +#include +#include #include "aom_dsp/entenc.h" +#include "aom_dsp/prob.h" + +#if OD_MEASURE_EC_OVERHEAD +#if !defined(M_LOG2E) +#define M_LOG2E (1.4426950408889634073599246810019) +#endif +#define OD_LOG2(x) (M_LOG2E * log(x)) +#endif // OD_MEASURE_EC_OVERHEAD /*A range encoder. See entdec.c and the references for implementation details \cite{Mar79,MNW98}. @@ -53,7 +59,7 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, int c; int s; c = enc->cnt; - OD_ASSERT(rng <= 65535U); + assert(rng <= 65535U); d = 16 - OD_ILOG_NZ(rng); s = c + d; /*TODO: Right now we flush every time we have at least one byte available. @@ -83,13 +89,13 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, c += 16; m = (1 << c) - 1; if (s >= 8) { - OD_ASSERT(offs < storage); + assert(offs < storage); buf[offs++] = (uint16_t)(low >> c); low &= m; c -= 8; m >>= 8; } - OD_ASSERT(offs < storage); + assert(offs < storage); buf[offs++] = (uint16_t)(low >> c); s = c + d - 24; low &= m; @@ -120,9 +126,6 @@ void od_ec_enc_init(od_ec_enc *enc, uint32_t size) { /*Reinitializes the encoder.*/ void od_ec_enc_reset(od_ec_enc *enc) { - enc->end_offs = 0; - enc->end_window = 0; - enc->nend_bits = 0; enc->offs = 0; enc->low = 0; enc->rng = 0x8000; @@ -143,31 +146,42 @@ void od_ec_enc_clear(od_ec_enc *enc) { } /*Encodes a symbol given its frequency in Q15. - fl: 32768 minus the cumulative frequency of all symbols that come before the + fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come + before the one to be encoded. - fh: 32768 minus the cumulative frequency of all symbols up to and including + fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and + including the one to be encoded.*/ -static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh) { +static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s, + int nsyms) { od_ec_window l; unsigned r; unsigned u; unsigned v; l = enc->low; r = enc->rng; - OD_ASSERT(32768U <= r); - OD_ASSERT(fh < fl); - OD_ASSERT(fl <= 32768U); - if (fl < 32768U) { - u = (r >> 8) * (uint32_t)fl >> 7; - v = (r >> 8) * (uint32_t)fh >> 7; + assert(32768U <= r); + assert(fh <= fl); + assert(fl <= 32768U); + assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0); + const int N = nsyms - 1; + if (fl < CDF_PROB_TOP) { + u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s - 1)); + v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); l += r - u; r = u - v; } else { - r -= (r >> 8) * (uint32_t)fh >> 7; + r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> + (7 - EC_PROB_SHIFT - CDF_SHIFT)) + + EC_MIN_PROB * (N - (s + 0)); } od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD - enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / 32768.); + enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP.); enc->nb_symbols++; #endif } @@ -179,18 +193,18 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { od_ec_window l; unsigned r; unsigned v; - OD_ASSERT(0 < f); - OD_ASSERT(f < 32768U); + assert(0 < f); + assert(f < 32768U); l = enc->low; r = enc->rng; - OD_ASSERT(32768U <= r); - v = (r >> 8) * (uint32_t)f >> 7; + assert(32768U <= r); + v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)); + v += EC_MIN_PROB; if (val) l += r - v; r = val ? v : r - v; od_ec_enc_normalize(enc, l, r); #if OD_MEASURE_EC_OVERHEAD - enc->entropy -= - OD_LOG2((double)(val ? 32768 - OD_ICDF(f) : OD_ICDF(f)) / 32768.); + enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.); enc->nb_symbols++; #endif } @@ -206,67 +220,12 @@ void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) { void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf, int nsyms) { (void)nsyms; - OD_ASSERT(s >= 0); - OD_ASSERT(s < nsyms); - OD_ASSERT(icdf[nsyms - 1] == OD_ICDF(32768U)); - od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s]); + assert(s >= 0); + assert(s < nsyms); + assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP)); + od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms); } -#if CONFIG_RAWBITS -/*Encodes a sequence of raw bits in the stream. - fl: The bits to encode. - ftb: The number of bits to encode. - This must be between 0 and 25, inclusive.*/ -void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) { - od_ec_window end_window; - int nend_bits; - OD_ASSERT(ftb <= 25); - OD_ASSERT(fl < (uint32_t)1 << ftb); -#if OD_MEASURE_EC_OVERHEAD - enc->entropy += ftb; -#endif - end_window = enc->end_window; - nend_bits = enc->nend_bits; - if (nend_bits + ftb > OD_EC_WINDOW_SIZE) { - unsigned char *buf; - uint32_t storage; - uint32_t end_offs; - buf = enc->buf; - storage = enc->storage; - end_offs = enc->end_offs; - if (end_offs + (OD_EC_WINDOW_SIZE >> 3) >= storage) { - unsigned char *new_buf; - uint32_t new_storage; - new_storage = 2 * storage + (OD_EC_WINDOW_SIZE >> 3); - new_buf = (unsigned char *)malloc(sizeof(*new_buf) * new_storage); - if (new_buf == NULL) { - enc->error = -1; - enc->end_offs = 0; - return; - } - OD_COPY(new_buf + new_storage - end_offs, buf + storage - end_offs, - end_offs); - storage = new_storage; - free(buf); - enc->buf = buf = new_buf; - enc->storage = storage; - } - do { - OD_ASSERT(end_offs < storage); - buf[storage - ++end_offs] = (unsigned char)end_window; - end_window >>= 8; - nend_bits -= 8; - } while (nend_bits >= 8); - enc->end_offs = end_offs; - } - OD_ASSERT(nend_bits + ftb <= OD_EC_WINDOW_SIZE); - end_window |= (od_ec_window)fl << nend_bits; - nend_bits += ftb; - enc->end_window = end_window; - enc->nend_bits = nend_bits; -} -#endif - /*Overwrites a few bits at the very start of an existing stream, after they have already been encoded. This makes it possible to have a few flags up front, where it is easy for @@ -284,9 +243,9 @@ void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb) { void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) { int shift; unsigned mask; - OD_ASSERT(nbits >= 0); - OD_ASSERT(nbits <= 8); - OD_ASSERT(val < 1U << nbits); + assert(nbits >= 0); + assert(nbits <= 8); + assert(val < 1U << nbits); shift = 8 - nbits; mask = ((1U << nbits) - 1) << shift; if (enc->offs > 0) { @@ -318,12 +277,9 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { uint32_t storage; uint16_t *buf; uint32_t offs; - uint32_t end_offs; - int nend_bits; od_ec_window m; od_ec_window e; od_ec_window l; - unsigned r; int c; int s; if (enc->error) return NULL; @@ -341,16 +297,10 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { /*We output the minimum number of bits that ensures that the symbols encoded thus far will be decoded correctly regardless of the bits that follow.*/ l = enc->low; - r = enc->rng; c = enc->cnt; - s = 9; - m = 0x7FFF; - e = (l + m) & ~m; - while ((e | m) >= l + r) { - s++; - m >>= 1; - e = (l + m) & ~m; - } + s = 10; + m = 0x3FFF; + e = ((l + m) & ~m) | (m + 1); s += c; offs = enc->offs; buf = enc->precarry_buf; @@ -369,7 +319,7 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { } n = (1 << (c + 16)) - 1; do { - OD_ASSERT(offs < storage); + assert(offs < storage); buf[offs++] = (uint16_t)(e >> (c + 16)); e &= n; s -= 8; @@ -377,49 +327,31 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { n >>= 8; } while (s > 0); } - /*Make sure there's enough room for the entropy-coded bits and the raw - bits.*/ + /*Make sure there's enough room for the entropy-coded bits.*/ out = enc->buf; storage = enc->storage; - end_offs = enc->end_offs; - e = enc->end_window; - nend_bits = enc->nend_bits; - s = -s; - c = OD_MAXI((nend_bits - s + 7) >> 3, 0); - if (offs + end_offs + c > storage) { - storage = offs + end_offs + c; + c = OD_MAXI((s + 7) >> 3, 0); + if (offs + c > storage) { + storage = offs + c; out = (unsigned char *)realloc(out, sizeof(*out) * storage); if (out == NULL) { enc->error = -1; return NULL; } - OD_MOVE(out + storage - end_offs, out + enc->storage - end_offs, end_offs); enc->buf = out; enc->storage = storage; } - /*If we have buffered raw bits, flush them as well.*/ - while (nend_bits > s) { - OD_ASSERT(end_offs < storage); - out[storage - ++end_offs] = (unsigned char)e; - e >>= 8; - nend_bits -= 8; - } - *nbytes = offs + end_offs; + *nbytes = offs; /*Perform carry propagation.*/ - OD_ASSERT(offs + end_offs <= storage); - out = out + storage - (offs + end_offs); + assert(offs <= storage); + out = out + storage - offs; c = 0; - end_offs = offs; while (offs > 0) { offs--; c = buf[offs] + c; out[offs] = (unsigned char)c; c >>= 8; } - /*Add any remaining raw bits to the last byte. - There is guaranteed to be enough room, because nend_bits <= s.*/ - OD_ASSERT(nend_bits <= 0 || end_offs > 0); - if (nend_bits > 0) out[end_offs - 1] |= (unsigned char)e; /*Note: Unless there's an allocation error, if you keep encoding into the current buffer and call this function again later, everything will work just fine (you won't get a new packet out, but you will get a single @@ -441,7 +373,7 @@ unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) { int od_ec_enc_tell(const od_ec_enc *enc) { /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra bit, which we reserve for terminating the stream.*/ - return (enc->offs + enc->end_offs) * 8 + enc->cnt + enc->nend_bits + 10; + return (enc->cnt + 10) + enc->offs * 8; } /*Returns the number of bits "used" by the encoded symbols so far. @@ -476,8 +408,8 @@ void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { uint32_t storage; uint16_t *precarry_buf; uint32_t precarry_storage; - OD_ASSERT(dst->storage >= src->storage); - OD_ASSERT(dst->precarry_storage >= src->precarry_storage); + assert(dst->storage >= src->storage); + assert(dst->precarry_storage >= src->precarry_storage); buf = dst->buf; storage = dst->storage; precarry_buf = dst->precarry_buf; diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h index 314b36318..1988f6818 100644 --- a/third_party/aom/aom_dsp/entenc.h +++ b/third_party/aom/aom_dsp/entenc.h @@ -30,12 +30,6 @@ struct od_ec_enc { unsigned char *buf; /*The size of the buffer.*/ uint32_t storage; - /*The offset at which the last byte containing raw bits was written.*/ - uint32_t end_offs; - /*Bits that will be read from/written at the end.*/ - od_ec_window end_window; - /*Number of valid bits in end_window.*/ - int nend_bits; /*A buffer for output bytes with their associated carry flags.*/ uint16_t *precarry_buf; /*The size of the pre-carry buffer.*/ diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c index 09d945afc..3804519b3 100644 --- a/third_party/aom/aom_dsp/fastssim.c +++ b/third_party/aom/aom_dsp/fastssim.c @@ -15,8 +15,10 @@ #include #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/ssim.h" #include "aom_ports/system_state.h" @@ -25,12 +27,11 @@ typedef struct fs_ctx fs_ctx; #define SSIM_C1 (255 * 255 * 0.01 * 0.01) #define SSIM_C2 (255 * 255 * 0.03 * 0.03) -#if CONFIG_HIGHBITDEPTH #define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01) #define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01) #define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03) #define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03) -#endif + #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b)) #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b)) @@ -139,8 +140,8 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) { static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, int _s1ystride, const uint8_t *_src2, - int _s2ystride, int _w, int _h, uint32_t bd, - uint32_t shift) { + int _s2ystride, int _w, int _h, uint32_t shift, + int buf_is_hbd) { uint32_t *dst1; uint32_t *dst2; int w; @@ -161,7 +162,7 @@ static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1, int i1; i0 = 2 * i; i1 = FS_MINI(i0 + 1, _w); - if (bd == 8 && shift == 0) { + if (!buf_is_hbd) { dst1[j * w + i] = _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1]; @@ -198,13 +199,10 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { int i; int j; double ssim_c1 = SSIM_C1; -#if CONFIG_HIGHBITDEPTH + if (bit_depth == 10) ssim_c1 = SSIM_C1_10; if (bit_depth == 12) ssim_c1 = SSIM_C1_12; -#else - assert(bit_depth == 8); - (void)bit_depth; -#endif + w = _ctx->level[_l].w; h = _ctx->level[_l].h; col_sums_x = _ctx->col_buf; @@ -323,13 +321,8 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { int i; int j; double ssim_c2 = SSIM_C2; -#if CONFIG_HIGHBITDEPTH if (bit_depth == 10) ssim_c2 = SSIM_C2_10; if (bit_depth == 12) ssim_c2 = SSIM_C2_12; -#else - assert(bit_depth == 8); - (void)bit_depth; -#endif w = _ctx->level[_l].w; h = _ctx->level[_l].h; @@ -448,14 +441,14 @@ static double convert_ssim_db(double _ssim, double _weight) { static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst, int _dystride, int _w, int _h, uint32_t _bd, - uint32_t _shift) { + uint32_t _shift, int buf_is_hbd) { fs_ctx ctx; double ret; int l; ret = 1; fs_ctx_init(&ctx, _w, _h, FS_NLEVELS); - fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd, - _shift); + fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift, + buf_is_hbd); for (l = 0; l < FS_NLEVELS - 1; l++) { fs_calc_structure(&ctx, l, _bd); ret *= fs_average(&ctx, l); @@ -476,18 +469,19 @@ double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, uint32_t bd_shift = 0; aom_clear_system_state(); assert(bd >= in_bd); - + assert(source->flags == dest->flags); + int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH; bd_shift = bd - in_bd; *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer, dest->y_stride, source->y_crop_width, - source->y_crop_height, in_bd, bd_shift); + source->y_crop_height, in_bd, bd_shift, buf_is_hbd); *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer, dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, bd_shift); + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer, dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, bd_shift); + source->uv_crop_height, in_bd, bd_shift, buf_is_hbd); ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v)); return convert_ssim_db(ssimv, 1.0); } diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c new file mode 100644 index 000000000..0ba71cfb3 --- /dev/null +++ b/third_party/aom/aom_dsp/fft.c @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void simple_transpose(const float *A, float *B, int n) { + for (int y = 0; y < n; y++) { + for (int x = 0; x < n; x++) { + B[y * n + x] = A[x * n + y]; + } + } +} + +// The 1d transform is real to complex and packs the complex results in +// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real +// components, followed by the n/2 - 1 imaginary components). After the +// transform is done on the rows, the first n/2 + 1 columns are real, and +// the remaining are the imaginary components. After the transform on the +// columns, the region of [0, n/2]x[0, n/2] contains the real part of +// fft of the real columns. The real part of the 2d fft also includes the +// imaginary part of transformed imaginary columns. This function assembles +// the correct outputs while putting the real and imaginary components +// next to each other. +static INLINE void unpack_2d_output(const float *col_fft, float *output, + int n) { + for (int y = 0; y <= n / 2; ++y) { + const int y2 = y + n / 2; + const int y_extra = y2 > n / 2 && y2 < n; + + for (int x = 0; x <= n / 2; ++x) { + const int x2 = x + n / 2; + const int x_extra = x2 > n / 2 && x2 < n; + output[2 * (y * n + x)] = + col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? col_fft[y * n + x2] : 0); + if (y_extra) { + output[2 * ((n - y) * n + x)] = + col_fft[y * n + x] + + (x_extra && y_extra ? col_fft[y2 * n + x2] : 0); + output[2 * ((n - y) * n + x) + 1] = + -(y_extra ? col_fft[y2 * n + x] : 0) + + (x_extra ? col_fft[y * n + x2] : 0); + } + } + } +} + +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size) { + for (int x = 0; x < n; x += vec_size) { + tform(input + x, output + x, n); + } + transpose(output, temp, n); + + for (int x = 0; x < n; x += vec_size) { + tform(temp + x, output + x, n); + } + transpose(output, temp, n); + + unpack(temp, output, n); +} + +static INLINE void store_float(float *output, float input) { *output = input; } +static INLINE float add_float(float a, float b) { return a + b; } +static INLINE float sub_float(float a, float b) { return a - b; } +static INLINE float mul_float(float a, float b) { return a * b; } + +GEN_FFT_2(void, float, float, float, *, store_float); +GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float, + sub_float); +GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); + +void aom_fft2x2_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft4x4_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft8x8_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft16x16_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_fft32x32_float_c(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose, + unpack_2d_output, 1); +} + +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size) { + // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft + // and get real outputs. + for (int y = 0; y <= n / 2; ++y) { + output[y * n] = input[2 * y * n]; + output[y * n + 1] = input[2 * (y * n + n / 2)]; + } + for (int y = n / 2 + 1; y < n; ++y) { + output[y * n] = input[2 * (y - n / 2) * n + 1]; + output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1]; + } + + for (int i = 0; i < 2; i += vec_size) { + ifft_multi(output + i, temp + i, n); + } + + // For the other columns, since we don't have a full ifft for complex inputs + // we have to split them into the real and imaginary counterparts. + // Pack the real component, then the imaginary components. + for (int y = 0; y < n; ++y) { + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + 1)] = input[2 * (y * n + x)]; + } + for (int x = 1; x < n / 2; ++x) { + output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1]; + } + } + for (int y = 2; y < vec_size; y++) { + fft_single(output + y, temp + y, n); + } + // This is the part that can be sped up with SIMD + for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) { + fft_multi(output + y, temp + y, n); + } + + // Put the 0 and n/2 th results in the correct place. + for (int x = 0; x < n; ++x) { + output[x] = temp[x * n]; + output[(n / 2) * n + x] = temp[x * n + 1]; + } + // This rearranges and transposes. + for (int y = 1; y < n / 2; ++y) { + // Fill in the real columns + for (int x = 0; x <= n / 2; ++x) { + output[x + y * n] = + temp[(y + 1) + x * n] + + ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0); + } + for (int x = n / 2 + 1; x < n; ++x) { + output[x + y * n] = temp[(y + 1) + (n - x) * n] - + temp[(y + n / 2) + ((n - x) + n / 2) * n]; + } + // Fill in the imag columns + for (int x = 0; x <= n / 2; ++x) { + output[x + (y + n / 2) * n] = + temp[(y + n / 2) + x * n] - + ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0); + } + for (int x = n / 2 + 1; x < n; ++x) { + output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] + + temp[(y + n / 2) + (n - x) * n]; + } + } + for (int y = 0; y < n; y += vec_size) { + ifft_multi(output + y, temp + y, n); + } + transpose(temp, output, n); +} + +GEN_IFFT_2(void, float, float, float, *, store_float); +GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float, + sub_float); +GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); +GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float, + sub_float, mul_float); + +void aom_ifft2x2_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float, + aom_ifft1d_2_float, simple_transpose, 1); +} + +void aom_ifft4x4_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float, + aom_ifft1d_4_float, simple_transpose, 1); +} + +void aom_ifft8x8_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float, + aom_ifft1d_8_float, simple_transpose, 1); +} + +void aom_ifft16x16_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1); +} + +void aom_ifft32x32_float_c(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1); +} diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h new file mode 100644 index 000000000..2f3cd5fdc --- /dev/null +++ b/third_party/aom/aom_dsp/fft_common.h @@ -0,0 +1,1050 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_FFT_COMMON_H_ +#define AOM_DSP_FFT_COMMON_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief A function pointer for computing 1d fft and ifft. + * + * The function will point to an implementation for a specific transform size, + * and may perform the transforms using vectorized instructions. + * + * For a non-vectorized forward transforms of size n, the input and output + * buffers will be size n. The output takes advantage of conjugate symmetry and + * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where + * (r_{j}, i_{j}) is the complex output for index j. + * + * An inverse transform will assume that the complex "input" is packed + * similarly. Its output will be real. + * + * Non-vectorized transforms (e.g., on a single row) would use a stride = 1. + * + * Vectorized implementations are parallelized along the columns so that the fft + * can be performed on multiple columns at a time. In such cases the data block + * for input and output is typically square (n x n) and the stride will + * correspond to the spacing between rows. At minimum, the input size must be + * n x simd_vector_length. + * + * \param[in] input Input buffer. See above for size restrictions. + * \param[out] output Output buffer. See above for size restrictions. + * \param[in] stride The spacing in number of elements between rows + * (or elements) + */ +typedef void (*aom_fft_1d_func_t)(const float *input, float *output, + int stride); + +// Declare some of the forward non-vectorized transforms which are used in some +// of the vectorized implementations +void aom_fft1d_4_float(const float *input, float *output, int stride); +void aom_fft1d_8_float(const float *input, float *output, int stride); +void aom_fft1d_16_float(const float *input, float *output, int stride); +void aom_fft1d_32_float(const float *input, float *output, int stride); + +/**\!brief Function pointer for transposing a matrix of floats. + * + * \param[in] input Input buffer (size n x n) + * \param[out] output Output buffer (size n x n) + * \param[in] n Extent of one dimension of the square matrix. + */ +typedef void (*aom_fft_transpose_func_t)(const float *input, float *output, + int n); + +/**\!brief Function pointer for re-arranging intermediate 2d transform results. + * + * After re-arrangement, the real and imaginary components will be packed + * tightly next to each other. + * + * \param[in] input Input buffer (size n x n) + * \param[out] output Output buffer (size 2 x n x n) + * \param[in] n Extent of one dimension of the square matrix. + */ +typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n); + +/*!\brief Performs a 2d fft with the given functions. + * + * This generator function allows for multiple different implementations of 2d + * fft with different vector operations, without having to redefine the main + * body multiple times. + * + * \param[in] input Input buffer to run the transform on (size n x n) + * \param[out] temp Working buffer for computing the transform (size n x n) + * \param[out] output Output buffer (size 2 x n x n) + * \param[in] tform Forward transform function + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] unpack Unpack function used to massage outputs to correct form + * \param[in] vec_size Vector size (the transform is done vec_size units at + * a time) + */ +void aom_fft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose, + aom_fft_unpack_func_t unpack, int vec_size); + +/*!\brief Perform a 2d inverse fft with the given helper functions + * + * \param[in] input Input buffer to run the transform on (size 2 x n x n) + * \param[out] temp Working buffer for computations (size 2 x n x n) + * \param[out] output Output buffer (size n x n) + * \param[in] fft_single Forward transform function (non vectorized) + * \param[in] fft_multi Forward transform function (vectorized) + * \param[in] ifft_multi Inverse transform function (vectorized) + * \param[in] transpose Transpose function (for n x n matrix) + * \param[in] vec_size Vector size (the transform is done vec_size + * units at a time) + */ +void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, + aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi, + aom_fft_1d_func_t ifft_multi, + aom_fft_transpose_func_t transpose, int vec_size); +#ifdef __cplusplus +} +#endif + +// The macros below define 1D fft/ifft for different data types and for +// different simd vector intrinsic types. + +#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w0 = add(i0, i2); \ + const T_VEC w1 = sub(i0, i2); \ + const T_VEC w2 = add(i1, i3); \ + const T_VEC w3 = sub(i1, i3); \ + store(output + 0 * stride, add(w0, w2)); \ + store(output + 1 * stride, w1); \ + store(output + 2 * stride, sub(w0, w2)); \ + store(output + 3 * stride, sub(kWeight0, w3)); \ + } + +#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \ + ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w0 = add(i0, i4); \ + const T_VEC w1 = sub(i0, i4); \ + const T_VEC w2 = add(i2, i6); \ + const T_VEC w3 = sub(i2, i6); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i1, i5); \ + const T_VEC w8 = sub(i1, i5); \ + const T_VEC w9 = add(i3, i7); \ + const T_VEC w10 = sub(i3, i7); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + store(output + 0 * stride, add(w4, w11)); \ + store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 2 * stride, w5); \ + store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \ + store(output + 4 * stride, sub(w4, w11)); \ + store(output + 5 * stride, \ + sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \ + store(output + 6 * stride, sub(kWeight0, w12)); \ + store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \ + } + +#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w0 = add(i0, i8); \ + const T_VEC w1 = sub(i0, i8); \ + const T_VEC w2 = add(i4, i12); \ + const T_VEC w3 = sub(i4, i12); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i2, i10); \ + const T_VEC w8 = sub(i2, i10); \ + const T_VEC w9 = add(i6, i14); \ + const T_VEC w10 = sub(i6, i14); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i1, i9); \ + const T_VEC w20 = sub(i1, i9); \ + const T_VEC w21 = add(i5, i13); \ + const T_VEC w22 = sub(i5, i13); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i3, i11); \ + const T_VEC w27 = sub(i3, i11); \ + const T_VEC w28 = add(i7, i15); \ + const T_VEC w29 = sub(i7, i15); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + store(output + 0 * stride, add(w14, w33)); \ + store(output + 1 * stride, \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \ + store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 3 * stride, \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \ + store(output + 4 * stride, w15); \ + store(output + 5 * stride, \ + add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \ + mul(kWeight3, w37[1])))); \ + store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \ + store(output + 7 * stride, \ + add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \ + mul(kWeight4, w35[1])))); \ + store(output + 8 * stride, sub(w14, w33)); \ + store(output + 9 * stride, \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \ + store(output + 10 * stride, \ + sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \ + store(output + 11 * stride, \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \ + store(output + 12 * stride, sub(kWeight0, w34)); \ + store(output + 13 * stride, \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \ + store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \ + store(output + 15 * stride, \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \ + } + +#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w0 = add(i0, i16); \ + const T_VEC w1 = sub(i0, i16); \ + const T_VEC w2 = add(i8, i24); \ + const T_VEC w3 = sub(i8, i24); \ + const T_VEC w4 = add(w0, w2); \ + const T_VEC w5 = sub(w0, w2); \ + const T_VEC w7 = add(i4, i20); \ + const T_VEC w8 = sub(i4, i20); \ + const T_VEC w9 = add(i12, i28); \ + const T_VEC w10 = sub(i12, i28); \ + const T_VEC w11 = add(w7, w9); \ + const T_VEC w12 = sub(w7, w9); \ + const T_VEC w14 = add(w4, w11); \ + const T_VEC w15 = sub(w4, w11); \ + const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \ + sub(sub(kWeight0, w3), \ + mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \ + sub(w3, mul(kWeight2, add(w10, w8))) }; \ + const T_VEC w19 = add(i2, i18); \ + const T_VEC w20 = sub(i2, i18); \ + const T_VEC w21 = add(i10, i26); \ + const T_VEC w22 = sub(i10, i26); \ + const T_VEC w23 = add(w19, w21); \ + const T_VEC w24 = sub(w19, w21); \ + const T_VEC w26 = add(i6, i22); \ + const T_VEC w27 = sub(i6, i22); \ + const T_VEC w28 = add(i14, i30); \ + const T_VEC w29 = sub(i14, i30); \ + const T_VEC w30 = add(w26, w28); \ + const T_VEC w31 = sub(w26, w28); \ + const T_VEC w33 = add(w23, w30); \ + const T_VEC w34 = sub(w23, w30); \ + const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \ + sub(sub(kWeight0, w22), \ + mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \ + sub(w22, mul(kWeight2, add(w29, w27))) }; \ + const T_VEC w38 = add(w14, w33); \ + const T_VEC w39 = sub(w14, w33); \ + const T_VEC w40[2] = { \ + add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \ + add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \ + }; \ + const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \ + sub(sub(kWeight0, w12), \ + mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w42[2] = { \ + add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \ + add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \ + }; \ + const T_VEC w44[2] = { \ + add(w18[0], \ + sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \ + sub(sub(kWeight0, w18[1]), \ + sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \ + }; \ + const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \ + sub(w12, mul(kWeight2, add(w31, w24))) }; \ + const T_VEC w46[2] = { \ + add(w16[0], \ + sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \ + sub(sub(kWeight0, w16[1]), \ + sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \ + }; \ + const T_VEC w47 = add(i1, i17); \ + const T_VEC w48 = sub(i1, i17); \ + const T_VEC w49 = add(i9, i25); \ + const T_VEC w50 = sub(i9, i25); \ + const T_VEC w51 = add(w47, w49); \ + const T_VEC w52 = sub(w47, w49); \ + const T_VEC w54 = add(i5, i21); \ + const T_VEC w55 = sub(i5, i21); \ + const T_VEC w56 = add(i13, i29); \ + const T_VEC w57 = sub(i13, i29); \ + const T_VEC w58 = add(w54, w56); \ + const T_VEC w59 = sub(w54, w56); \ + const T_VEC w61 = add(w51, w58); \ + const T_VEC w62 = sub(w51, w58); \ + const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \ + sub(sub(kWeight0, w50), \ + mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \ + sub(w50, mul(kWeight2, add(w57, w55))) }; \ + const T_VEC w66 = add(i3, i19); \ + const T_VEC w67 = sub(i3, i19); \ + const T_VEC w68 = add(i11, i27); \ + const T_VEC w69 = sub(i11, i27); \ + const T_VEC w70 = add(w66, w68); \ + const T_VEC w71 = sub(w66, w68); \ + const T_VEC w73 = add(i7, i23); \ + const T_VEC w74 = sub(i7, i23); \ + const T_VEC w75 = add(i15, i31); \ + const T_VEC w76 = sub(i15, i31); \ + const T_VEC w77 = add(w73, w75); \ + const T_VEC w78 = sub(w73, w75); \ + const T_VEC w80 = add(w70, w77); \ + const T_VEC w81 = sub(w70, w77); \ + const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \ + sub(sub(kWeight0, w69), \ + mul(kWeight2, add(w76, w74))) }; \ + const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \ + sub(w69, mul(kWeight2, add(w76, w74))) }; \ + const T_VEC w85 = add(w61, w80); \ + const T_VEC w86 = sub(w61, w80); \ + const T_VEC w87[2] = { \ + add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \ + add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \ + }; \ + const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \ + sub(sub(kWeight0, w59), \ + mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w89[2] = { \ + add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \ + add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \ + }; \ + const T_VEC w91[2] = { \ + add(w65[0], \ + sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \ + sub(sub(kWeight0, w65[1]), \ + sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \ + }; \ + const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \ + sub(w59, mul(kWeight2, add(w78, w71))) }; \ + const T_VEC w93[2] = { \ + add(w63[0], \ + sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \ + sub(sub(kWeight0, w63[1]), \ + sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \ + }; \ + store(output + 0 * stride, add(w38, w85)); \ + store(output + 1 * stride, \ + add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \ + store(output + 2 * stride, \ + add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \ + store(output + 3 * stride, \ + add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \ + store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 5 * stride, \ + add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \ + store(output + 6 * stride, \ + add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \ + store(output + 7 * stride, \ + add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \ + store(output + 8 * stride, w39); \ + store(output + 9 * stride, \ + add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \ + mul(kWeight5, w93[1])))); \ + store(output + 10 * stride, \ + add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \ + mul(kWeight3, w92[1])))); \ + store(output + 11 * stride, \ + add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \ + mul(kWeight7, w91[1])))); \ + store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \ + store(output + 13 * stride, \ + add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \ + mul(kWeight8, w89[1])))); \ + store(output + 14 * stride, \ + add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \ + mul(kWeight4, w88[1])))); \ + store(output + 15 * stride, \ + add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \ + mul(kWeight6, w87[1])))); \ + store(output + 16 * stride, sub(w38, w85)); \ + store(output + 17 * stride, \ + add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \ + store(output + 18 * stride, \ + add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \ + store(output + 19 * stride, \ + add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \ + store(output + 20 * stride, \ + sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \ + store(output + 21 * stride, \ + add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \ + store(output + 22 * stride, \ + add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \ + store(output + 23 * stride, \ + add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \ + store(output + 24 * stride, sub(kWeight0, w86)); \ + store(output + 25 * stride, \ + sub(sub(kWeight0, w46[1]), \ + sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \ + store(output + 26 * stride, \ + sub(sub(kWeight0, w45[1]), \ + sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \ + store(output + 27 * stride, \ + sub(sub(kWeight0, w44[1]), \ + sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \ + store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \ + store(output + 29 * stride, \ + sub(sub(kWeight0, w42[1]), \ + sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \ + store(output + 30 * stride, \ + sub(sub(kWeight0, w41[1]), \ + sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \ + store(output + 31 * stride, \ + sub(sub(kWeight0, w40[1]), \ + sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \ + } + +#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \ + ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + store(output + 0 * stride, i0 + i1); \ + store(output + 1 * stride, i0 - i1); \ + } + +#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \ + ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC w2 = add(i0, i2); \ + const T_VEC w3 = sub(i0, i2); \ + const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \ + const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \ + store(output + 0 * stride, add(w2, w4[0])); \ + store(output + 1 * stride, add(w3, w5[1])); \ + store(output + 2 * stride, sub(w2, w4[0])); \ + store(output + 3 * stride, sub(w3, w5[1])); \ + } + +#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC w6 = add(i0, i4); \ + const T_VEC w7 = sub(i0, i4); \ + const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \ + const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \ + const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \ + const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \ + const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \ + const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \ + const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \ + const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \ + const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \ + const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \ + const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \ + const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \ + const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \ + const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \ + store(output + 0 * stride, add(w10[0], w18[0])); \ + store(output + 1 * stride, \ + add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \ + store(output + 2 * stride, add(w11[0], w19[1])); \ + store(output + 3 * stride, \ + sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + store(output + 4 * stride, sub(w10[0], w18[0])); \ + store(output + 5 * stride, \ + add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \ + mul(kWeight2, w20[1])))); \ + store(output + 6 * stride, sub(w11[0], w19[1])); \ + store(output + 7 * stride, \ + add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \ + } + +#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC w14 = add(i0, i8); \ + const T_VEC w15 = sub(i0, i8); \ + const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \ + const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \ + const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \ + const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \ + const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \ + const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \ + const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \ + const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \ + const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \ + const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \ + const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \ + const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \ + const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \ + const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \ + const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \ + const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \ + const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \ + const T_VEC w33[2] = { add(w20[0], \ + sub(sub(kWeight0, mul(kWeight2, w28[0])), \ + mul(kWeight2, w28[1]))), \ + add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \ + const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \ + const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \ + const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \ + add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \ + const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \ + const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \ + const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \ + const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \ + const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \ + const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \ + const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \ + const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \ + const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \ + const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \ + const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \ + const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \ + const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \ + const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \ + const T_VEC w57[2] = { add(w44[0], \ + sub(sub(kWeight0, mul(kWeight2, w52[0])), \ + mul(kWeight2, w52[1]))), \ + add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \ + const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \ + const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \ + const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ + const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \ + add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \ + store(output + 0 * stride, add(w30[0], w54[0])); \ + store(output + 1 * stride, \ + add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \ + store(output + 2 * stride, \ + add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \ + store(output + 3 * stride, \ + add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \ + store(output + 4 * stride, add(w31[0], w55[1])); \ + store(output + 5 * stride, \ + sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 6 * stride, \ + sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 7 * stride, \ + sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + store(output + 8 * stride, sub(w30[0], w54[0])); \ + store(output + 9 * stride, \ + add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \ + mul(kWeight4, w56[1])))); \ + store(output + 10 * stride, \ + add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \ + mul(kWeight2, w58[1])))); \ + store(output + 11 * stride, \ + add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \ + mul(kWeight3, w60[1])))); \ + store(output + 12 * stride, sub(w31[0], w55[1])); \ + store(output + 13 * stride, \ + add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \ + store(output + 14 * stride, \ + add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \ + store(output + 15 * stride, \ + add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \ + } +#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \ + mul) \ + ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \ + const T_VEC kWeight0 = constant(0.0f); \ + const T_VEC kWeight2 = constant(0.707107f); \ + const T_VEC kWeight3 = constant(0.92388f); \ + const T_VEC kWeight4 = constant(0.382683f); \ + const T_VEC kWeight5 = constant(0.980785f); \ + const T_VEC kWeight6 = constant(0.19509f); \ + const T_VEC kWeight7 = constant(0.83147f); \ + const T_VEC kWeight8 = constant(0.55557f); \ + const T_VEC i0 = load(input + 0 * stride); \ + const T_VEC i1 = load(input + 1 * stride); \ + const T_VEC i2 = load(input + 2 * stride); \ + const T_VEC i3 = load(input + 3 * stride); \ + const T_VEC i4 = load(input + 4 * stride); \ + const T_VEC i5 = load(input + 5 * stride); \ + const T_VEC i6 = load(input + 6 * stride); \ + const T_VEC i7 = load(input + 7 * stride); \ + const T_VEC i8 = load(input + 8 * stride); \ + const T_VEC i9 = load(input + 9 * stride); \ + const T_VEC i10 = load(input + 10 * stride); \ + const T_VEC i11 = load(input + 11 * stride); \ + const T_VEC i12 = load(input + 12 * stride); \ + const T_VEC i13 = load(input + 13 * stride); \ + const T_VEC i14 = load(input + 14 * stride); \ + const T_VEC i15 = load(input + 15 * stride); \ + const T_VEC i16 = load(input + 16 * stride); \ + const T_VEC i17 = load(input + 17 * stride); \ + const T_VEC i18 = load(input + 18 * stride); \ + const T_VEC i19 = load(input + 19 * stride); \ + const T_VEC i20 = load(input + 20 * stride); \ + const T_VEC i21 = load(input + 21 * stride); \ + const T_VEC i22 = load(input + 22 * stride); \ + const T_VEC i23 = load(input + 23 * stride); \ + const T_VEC i24 = load(input + 24 * stride); \ + const T_VEC i25 = load(input + 25 * stride); \ + const T_VEC i26 = load(input + 26 * stride); \ + const T_VEC i27 = load(input + 27 * stride); \ + const T_VEC i28 = load(input + 28 * stride); \ + const T_VEC i29 = load(input + 29 * stride); \ + const T_VEC i30 = load(input + 30 * stride); \ + const T_VEC i31 = load(input + 31 * stride); \ + const T_VEC w30 = add(i0, i16); \ + const T_VEC w31 = sub(i0, i16); \ + const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \ + const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \ + const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \ + const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \ + const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \ + const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \ + const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \ + const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \ + const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \ + const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \ + const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \ + const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \ + const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \ + const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \ + const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \ + const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \ + const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \ + const T_VEC w49[2] = { add(w36[0], \ + sub(sub(kWeight0, mul(kWeight2, w44[0])), \ + mul(kWeight2, w44[1]))), \ + add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \ + const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \ + const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \ + const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \ + add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \ + const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \ + const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \ + const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \ + const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \ + const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \ + const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \ + const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \ + const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \ + const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \ + const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \ + const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \ + const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \ + const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \ + const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \ + const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \ + const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \ + const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \ + const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \ + const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \ + const T_VEC w73[2] = { add(w60[0], \ + sub(sub(kWeight0, mul(kWeight2, w68[0])), \ + mul(kWeight2, w68[1]))), \ + add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \ + const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \ + const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \ + const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \ + add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \ + const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \ + const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \ + const T_VEC w80[2] = { \ + add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \ + }; \ + const T_VEC w81[2] = { \ + add(w48[0], \ + sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \ + add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \ + }; \ + const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \ + const T_VEC w83[2] = { add(w50[0], \ + sub(sub(kWeight0, mul(kWeight2, w74[0])), \ + mul(kWeight2, w74[1]))), \ + add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \ + const T_VEC w84[2] = { \ + add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \ + }; \ + const T_VEC w85[2] = { \ + add(w52[0], \ + sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \ + add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \ + }; \ + const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \ + const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \ + const T_VEC w88[2] = { \ + sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ + add(w49[1], \ + sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w89[2] = { \ + add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \ + add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \ + }; \ + const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \ + add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \ + const T_VEC w92[2] = { \ + sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], \ + sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w93[2] = { \ + add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \ + add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \ + }; \ + const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \ + const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \ + const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \ + const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \ + const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \ + const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \ + const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \ + const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \ + const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \ + const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \ + const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \ + const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \ + const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \ + const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \ + const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \ + const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \ + const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \ + const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \ + const T_VEC w112[2] = { \ + add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \ + }; \ + const T_VEC w113[2] = { \ + add(w100[0], \ + sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \ + add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \ + }; \ + const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \ + const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \ + const T_VEC w116[2] = { \ + sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w117[2] = { \ + add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \ + add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \ + }; \ + const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \ + const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \ + const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \ + const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \ + const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \ + const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \ + const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \ + const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \ + const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \ + const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \ + const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \ + const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \ + const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \ + const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \ + const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \ + const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \ + const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \ + const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \ + const T_VEC w136[2] = { \ + add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \ + }; \ + const T_VEC w137[2] = { \ + add(w124[0], \ + sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \ + add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \ + }; \ + const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \ + const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \ + const T_VEC w140[2] = { \ + sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w141[2] = { \ + add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \ + add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \ + }; \ + const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \ + const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \ + const T_VEC w144[2] = { \ + add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \ + }; \ + const T_VEC w145[2] = { \ + add(w112[0], \ + sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \ + add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \ + }; \ + const T_VEC w146[2] = { \ + add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \ + }; \ + const T_VEC w147[2] = { \ + add(w114[0], \ + sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \ + add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \ + }; \ + const T_VEC w148[2] = { \ + add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \ + }; \ + const T_VEC w149[2] = { \ + add(w116[0], \ + sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \ + add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \ + }; \ + const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \ + const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \ + const T_VEC w152[2] = { \ + sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], \ + sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w153[2] = { \ + add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \ + add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \ + }; \ + const T_VEC w154[2] = { \ + sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w155[2] = { \ + add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \ + add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \ + }; \ + const T_VEC w156[2] = { \ + sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], \ + sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \ + }; \ + const T_VEC w157[2] = { \ + add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \ + add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \ + }; \ + store(output + 0 * stride, add(w78[0], w142[0])); \ + store(output + 1 * stride, \ + add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \ + store(output + 2 * stride, \ + add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \ + store(output + 3 * stride, \ + add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \ + store(output + 4 * stride, \ + add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \ + store(output + 5 * stride, \ + add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \ + store(output + 6 * stride, \ + add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \ + store(output + 7 * stride, \ + add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \ + store(output + 8 * stride, add(w79[0], w143[1])); \ + store(output + 9 * stride, \ + sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 10 * stride, \ + sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 11 * stride, \ + sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 12 * stride, \ + sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 13 * stride, \ + sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 14 * stride, \ + sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 15 * stride, \ + sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + store(output + 16 * stride, sub(w78[0], w142[0])); \ + store(output + 17 * stride, \ + add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \ + mul(kWeight6, w144[1])))); \ + store(output + 18 * stride, \ + add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \ + mul(kWeight4, w146[1])))); \ + store(output + 19 * stride, \ + add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \ + mul(kWeight8, w148[1])))); \ + store(output + 20 * stride, \ + add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \ + mul(kWeight2, w150[1])))); \ + store(output + 21 * stride, \ + add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \ + mul(kWeight7, w152[1])))); \ + store(output + 22 * stride, \ + add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \ + mul(kWeight3, w154[1])))); \ + store(output + 23 * stride, \ + add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \ + mul(kWeight5, w156[1])))); \ + store(output + 24 * stride, sub(w79[0], w143[1])); \ + store(output + 25 * stride, \ + add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \ + store(output + 26 * stride, \ + add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \ + store(output + 27 * stride, \ + add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \ + store(output + 28 * stride, \ + add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \ + store(output + 29 * stride, \ + add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \ + store(output + 30 * stride, \ + add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \ + store(output + 31 * stride, \ + add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ + } + +#endif // AOM_DSP_FFT_COMMON_H_ diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c index 1ceef7782..e50f951c1 100644 --- a/third_party/aom/aom_dsp/fwd_txfm.c +++ b/third_party/aom/aom_dsp/fwd_txfm.c @@ -9,84 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "aom_dsp/fwd_txfm.h" #include -#include "./aom_dsp_rtcd.h" - -void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - tran_low_t intermediate[4 * 4]; - const tran_low_t *in_low = NULL; - tran_low_t *out = intermediate; - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - tran_high_t in_high[4]; // canbe16 - tran_high_t step[4]; // canbe16 - tran_high_t temp1, temp2; // needs32 - int i; - for (i = 0; i < 4; ++i) { - // Load inputs. - if (pass == 0) { - in_high[0] = input[0 * stride] * 16; - in_high[1] = input[1 * stride] * 16; - in_high[2] = input[2 * stride] * 16; - in_high[3] = input[3 * stride] * 16; - if (i == 0 && in_high[0]) { - ++in_high[0]; - } - } else { - assert(in_low != NULL); - in_high[0] = in_low[0 * 4]; - in_high[1] = in_low[1 * 4]; - in_high[2] = in_low[2 * 4]; - in_high[3] = in_low[3 * 4]; - ++in_low; - } - // Transform. - step[0] = in_high[0] + in_high[3]; - step[1] = in_high[1] + in_high[2]; - step[2] = in_high[1] - in_high[2]; - step[3] = in_high[0] - in_high[3]; - temp1 = (step[0] + step[1]) * cospi_16_64; - temp2 = (step[0] - step[1]) * cospi_16_64; - out[0] = (tran_low_t)fdct_round_shift(temp1); - out[2] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; - temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; - out[1] = (tran_low_t)fdct_round_shift(temp1); - out[3] = (tran_low_t)fdct_round_shift(temp2); - // Do next column (which is a transposed row in second/horizontal pass) - ++input; - out += 4; - } - // Setup in/out for next pass. - in_low = intermediate; - out = output; - } - - { - int i, j; - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2; - } - } -} - -void aom_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { - int r, c; - tran_low_t sum = 0; - for (r = 0; r < 4; ++r) - for (c = 0; c < 4; ++c) sum += input[r * stride + c]; - - output[0] = sum << 1; -} +#include "aom_dsp/txfm_common.h" +#include "config/aom_dsp_rtcd.h" void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { int i, j; @@ -172,596 +97,7 @@ void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { } } -void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - tran_low_t intermediate[256]; - const tran_low_t *in_low = NULL; - tran_low_t *out = intermediate; - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - tran_high_t step1[8]; // canbe16 - tran_high_t step2[8]; // canbe16 - tran_high_t step3[8]; // canbe16 - tran_high_t in_high[8]; // canbe16 - tran_high_t temp1, temp2; // needs32 - int i; - for (i = 0; i < 16; i++) { - if (0 == pass) { - // Calculate input for the first 8 results. - in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; - in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; - in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; - in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; - in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; - in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; - in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; - in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; - // Calculate input for the next 8 results. - step1[0] = (input[7 * stride] - input[8 * stride]) * 4; - step1[1] = (input[6 * stride] - input[9 * stride]) * 4; - step1[2] = (input[5 * stride] - input[10 * stride]) * 4; - step1[3] = (input[4 * stride] - input[11 * stride]) * 4; - step1[4] = (input[3 * stride] - input[12 * stride]) * 4; - step1[5] = (input[2 * stride] - input[13 * stride]) * 4; - step1[6] = (input[1 * stride] - input[14 * stride]) * 4; - step1[7] = (input[0 * stride] - input[15 * stride]) * 4; - } else { - // Calculate input for the first 8 results. - assert(in_low != NULL); - in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); - in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); - in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); - in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); - in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); - in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); - in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); - in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); - // Calculate input for the next 8 results. - step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); - step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); - step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); - step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); - step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); - step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); - step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); - step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); - in_low++; - } - // Work on the first eight values; fdct8(input, even_results); - { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 - tran_high_t t0, t1, t2, t3; // needs32 - tran_high_t x0, x1, x2, x3; // canbe16 - - // stage 1 - s0 = in_high[0] + in_high[7]; - s1 = in_high[1] + in_high[6]; - s2 = in_high[2] + in_high[5]; - s3 = in_high[3] + in_high[4]; - s4 = in_high[3] - in_high[4]; - s5 = in_high[2] - in_high[5]; - s6 = in_high[1] - in_high[6]; - s7 = in_high[0] - in_high[7]; - - // fdct4(step, step); - x0 = s0 + s3; - x1 = s1 + s2; - x2 = s1 - s2; - x3 = s0 - s3; - t0 = (x0 + x1) * cospi_16_64; - t1 = (x0 - x1) * cospi_16_64; - t2 = x3 * cospi_8_64 + x2 * cospi_24_64; - t3 = x3 * cospi_24_64 - x2 * cospi_8_64; - out[0] = (tran_low_t)fdct_round_shift(t0); - out[4] = (tran_low_t)fdct_round_shift(t2); - out[8] = (tran_low_t)fdct_round_shift(t1); - out[12] = (tran_low_t)fdct_round_shift(t3); - - // Stage 2 - t0 = (s6 - s5) * cospi_16_64; - t1 = (s6 + s5) * cospi_16_64; - t2 = fdct_round_shift(t0); - t3 = fdct_round_shift(t1); - - // Stage 3 - x0 = s4 + t2; - x1 = s4 - t2; - x2 = s7 - t3; - x3 = s7 + t3; - - // Stage 4 - t0 = x0 * cospi_28_64 + x3 * cospi_4_64; - t1 = x1 * cospi_12_64 + x2 * cospi_20_64; - t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; - t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - out[2] = (tran_low_t)fdct_round_shift(t0); - out[6] = (tran_low_t)fdct_round_shift(t2); - out[10] = (tran_low_t)fdct_round_shift(t1); - out[14] = (tran_low_t)fdct_round_shift(t3); - } - // Work on the next eight values; step1 -> odd_results - { - // step 2 - temp1 = (step1[5] - step1[2]) * cospi_16_64; - temp2 = (step1[4] - step1[3]) * cospi_16_64; - step2[2] = fdct_round_shift(temp1); - step2[3] = fdct_round_shift(temp2); - temp1 = (step1[4] + step1[3]) * cospi_16_64; - temp2 = (step1[5] + step1[2]) * cospi_16_64; - step2[4] = fdct_round_shift(temp1); - step2[5] = fdct_round_shift(temp2); - // step 3 - step3[0] = step1[0] + step2[3]; - step3[1] = step1[1] + step2[2]; - step3[2] = step1[1] - step2[2]; - step3[3] = step1[0] - step2[3]; - step3[4] = step1[7] - step2[4]; - step3[5] = step1[6] - step2[5]; - step3[6] = step1[6] + step2[5]; - step3[7] = step1[7] + step2[4]; - // step 4 - temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; - temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; - step2[1] = fdct_round_shift(temp1); - step2[2] = fdct_round_shift(temp2); - temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; - temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; - step2[5] = fdct_round_shift(temp1); - step2[6] = fdct_round_shift(temp2); - // step 5 - step1[0] = step3[0] + step2[1]; - step1[1] = step3[0] - step2[1]; - step1[2] = step3[3] + step2[2]; - step1[3] = step3[3] - step2[2]; - step1[4] = step3[4] - step2[5]; - step1[5] = step3[4] + step2[5]; - step1[6] = step3[7] - step2[6]; - step1[7] = step3[7] + step2[6]; - // step 6 - temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; - temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; - out[1] = (tran_low_t)fdct_round_shift(temp1); - out[9] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; - temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; - out[5] = (tran_low_t)fdct_round_shift(temp1); - out[13] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; - temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; - out[3] = (tran_low_t)fdct_round_shift(temp1); - out[11] = (tran_low_t)fdct_round_shift(temp2); - temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; - temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; - out[7] = (tran_low_t)fdct_round_shift(temp1); - out[15] = (tran_low_t)fdct_round_shift(temp2); - } - // Do next column (which is a transposed row in second/horizontal pass) - input++; - out += 16; - } - // Setup in/out for next pass. - in_low = intermediate; - out = output; - } -} - -static INLINE tran_high_t dct_32_round(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - // TODO(debargha, peter.derivaz): Find new bounds for this assert, - // and make the bounds consts. - // assert(-131072 <= rv && rv <= 131071); - return rv; -} - -static INLINE tran_high_t half_round_shift(tran_high_t input) { - tran_high_t rv = (input + 1 + (input < 0)) >> 2; - return rv; -} - -void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round) { - tran_high_t step[32]; - // Stage 1 - step[0] = input[0] + input[(32 - 1)]; - step[1] = input[1] + input[(32 - 2)]; - step[2] = input[2] + input[(32 - 3)]; - step[3] = input[3] + input[(32 - 4)]; - step[4] = input[4] + input[(32 - 5)]; - step[5] = input[5] + input[(32 - 6)]; - step[6] = input[6] + input[(32 - 7)]; - step[7] = input[7] + input[(32 - 8)]; - step[8] = input[8] + input[(32 - 9)]; - step[9] = input[9] + input[(32 - 10)]; - step[10] = input[10] + input[(32 - 11)]; - step[11] = input[11] + input[(32 - 12)]; - step[12] = input[12] + input[(32 - 13)]; - step[13] = input[13] + input[(32 - 14)]; - step[14] = input[14] + input[(32 - 15)]; - step[15] = input[15] + input[(32 - 16)]; - step[16] = -input[16] + input[(32 - 17)]; - step[17] = -input[17] + input[(32 - 18)]; - step[18] = -input[18] + input[(32 - 19)]; - step[19] = -input[19] + input[(32 - 20)]; - step[20] = -input[20] + input[(32 - 21)]; - step[21] = -input[21] + input[(32 - 22)]; - step[22] = -input[22] + input[(32 - 23)]; - step[23] = -input[23] + input[(32 - 24)]; - step[24] = -input[24] + input[(32 - 25)]; - step[25] = -input[25] + input[(32 - 26)]; - step[26] = -input[26] + input[(32 - 27)]; - step[27] = -input[27] + input[(32 - 28)]; - step[28] = -input[28] + input[(32 - 29)]; - step[29] = -input[29] + input[(32 - 30)]; - step[30] = -input[30] + input[(32 - 31)]; - step[31] = -input[31] + input[(32 - 32)]; - - // Stage 2 - output[0] = step[0] + step[16 - 1]; - output[1] = step[1] + step[16 - 2]; - output[2] = step[2] + step[16 - 3]; - output[3] = step[3] + step[16 - 4]; - output[4] = step[4] + step[16 - 5]; - output[5] = step[5] + step[16 - 6]; - output[6] = step[6] + step[16 - 7]; - output[7] = step[7] + step[16 - 8]; - output[8] = -step[8] + step[16 - 9]; - output[9] = -step[9] + step[16 - 10]; - output[10] = -step[10] + step[16 - 11]; - output[11] = -step[11] + step[16 - 12]; - output[12] = -step[12] + step[16 - 13]; - output[13] = -step[13] + step[16 - 14]; - output[14] = -step[14] + step[16 - 15]; - output[15] = -step[15] + step[16 - 16]; - - output[16] = step[16]; - output[17] = step[17]; - output[18] = step[18]; - output[19] = step[19]; - - output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); - output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); - output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); - output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); - - output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); - output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); - output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); - output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); - - output[28] = step[28]; - output[29] = step[29]; - output[30] = step[30]; - output[31] = step[31]; - - // dump the magnitude by 4, hence the intermediate values are within - // the range of 16 bits. - if (round) { - output[0] = half_round_shift(output[0]); - output[1] = half_round_shift(output[1]); - output[2] = half_round_shift(output[2]); - output[3] = half_round_shift(output[3]); - output[4] = half_round_shift(output[4]); - output[5] = half_round_shift(output[5]); - output[6] = half_round_shift(output[6]); - output[7] = half_round_shift(output[7]); - output[8] = half_round_shift(output[8]); - output[9] = half_round_shift(output[9]); - output[10] = half_round_shift(output[10]); - output[11] = half_round_shift(output[11]); - output[12] = half_round_shift(output[12]); - output[13] = half_round_shift(output[13]); - output[14] = half_round_shift(output[14]); - output[15] = half_round_shift(output[15]); - - output[16] = half_round_shift(output[16]); - output[17] = half_round_shift(output[17]); - output[18] = half_round_shift(output[18]); - output[19] = half_round_shift(output[19]); - output[20] = half_round_shift(output[20]); - output[21] = half_round_shift(output[21]); - output[22] = half_round_shift(output[22]); - output[23] = half_round_shift(output[23]); - output[24] = half_round_shift(output[24]); - output[25] = half_round_shift(output[25]); - output[26] = half_round_shift(output[26]); - output[27] = half_round_shift(output[27]); - output[28] = half_round_shift(output[28]); - output[29] = half_round_shift(output[29]); - output[30] = half_round_shift(output[30]); - output[31] = half_round_shift(output[31]); - } - - // Stage 3 - step[0] = output[0] + output[(8 - 1)]; - step[1] = output[1] + output[(8 - 2)]; - step[2] = output[2] + output[(8 - 3)]; - step[3] = output[3] + output[(8 - 4)]; - step[4] = -output[4] + output[(8 - 5)]; - step[5] = -output[5] + output[(8 - 6)]; - step[6] = -output[6] + output[(8 - 7)]; - step[7] = -output[7] + output[(8 - 8)]; - step[8] = output[8]; - step[9] = output[9]; - step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); - step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); - step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); - step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); - step[14] = output[14]; - step[15] = output[15]; - - step[16] = output[16] + output[23]; - step[17] = output[17] + output[22]; - step[18] = output[18] + output[21]; - step[19] = output[19] + output[20]; - step[20] = -output[20] + output[19]; - step[21] = -output[21] + output[18]; - step[22] = -output[22] + output[17]; - step[23] = -output[23] + output[16]; - step[24] = -output[24] + output[31]; - step[25] = -output[25] + output[30]; - step[26] = -output[26] + output[29]; - step[27] = -output[27] + output[28]; - step[28] = output[28] + output[27]; - step[29] = output[29] + output[26]; - step[30] = output[30] + output[25]; - step[31] = output[31] + output[24]; - - // Stage 4 - output[0] = step[0] + step[3]; - output[1] = step[1] + step[2]; - output[2] = -step[2] + step[1]; - output[3] = -step[3] + step[0]; - output[4] = step[4]; - output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); - output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); - output[7] = step[7]; - output[8] = step[8] + step[11]; - output[9] = step[9] + step[10]; - output[10] = -step[10] + step[9]; - output[11] = -step[11] + step[8]; - output[12] = -step[12] + step[15]; - output[13] = -step[13] + step[14]; - output[14] = step[14] + step[13]; - output[15] = step[15] + step[12]; - - output[16] = step[16]; - output[17] = step[17]; - output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); - output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); - output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); - output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); - output[22] = step[22]; - output[23] = step[23]; - output[24] = step[24]; - output[25] = step[25]; - output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); - output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); - output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); - output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); - output[30] = step[30]; - output[31] = step[31]; - - // Stage 5 - step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); - step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); - step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); - step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); - step[4] = output[4] + output[5]; - step[5] = -output[5] + output[4]; - step[6] = -output[6] + output[7]; - step[7] = output[7] + output[6]; - step[8] = output[8]; - step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); - step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); - step[11] = output[11]; - step[12] = output[12]; - step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); - step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); - step[15] = output[15]; - - step[16] = output[16] + output[19]; - step[17] = output[17] + output[18]; - step[18] = -output[18] + output[17]; - step[19] = -output[19] + output[16]; - step[20] = -output[20] + output[23]; - step[21] = -output[21] + output[22]; - step[22] = output[22] + output[21]; - step[23] = output[23] + output[20]; - step[24] = output[24] + output[27]; - step[25] = output[25] + output[26]; - step[26] = -output[26] + output[25]; - step[27] = -output[27] + output[24]; - step[28] = -output[28] + output[31]; - step[29] = -output[29] + output[30]; - step[30] = output[30] + output[29]; - step[31] = output[31] + output[28]; - - // Stage 6 - output[0] = step[0]; - output[1] = step[1]; - output[2] = step[2]; - output[3] = step[3]; - output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); - output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); - output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); - output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); - output[8] = step[8] + step[9]; - output[9] = -step[9] + step[8]; - output[10] = -step[10] + step[11]; - output[11] = step[11] + step[10]; - output[12] = step[12] + step[13]; - output[13] = -step[13] + step[12]; - output[14] = -step[14] + step[15]; - output[15] = step[15] + step[14]; - - output[16] = step[16]; - output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); - output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); - output[19] = step[19]; - output[20] = step[20]; - output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); - output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); - output[23] = step[23]; - output[24] = step[24]; - output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); - output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); - output[27] = step[27]; - output[28] = step[28]; - output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); - output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); - output[31] = step[31]; - - // Stage 7 - step[0] = output[0]; - step[1] = output[1]; - step[2] = output[2]; - step[3] = output[3]; - step[4] = output[4]; - step[5] = output[5]; - step[6] = output[6]; - step[7] = output[7]; - step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); - step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); - step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); - step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); - step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); - step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); - step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); - step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); - - step[16] = output[16] + output[17]; - step[17] = -output[17] + output[16]; - step[18] = -output[18] + output[19]; - step[19] = output[19] + output[18]; - step[20] = output[20] + output[21]; - step[21] = -output[21] + output[20]; - step[22] = -output[22] + output[23]; - step[23] = output[23] + output[22]; - step[24] = output[24] + output[25]; - step[25] = -output[25] + output[24]; - step[26] = -output[26] + output[27]; - step[27] = output[27] + output[26]; - step[28] = output[28] + output[29]; - step[29] = -output[29] + output[28]; - step[30] = -output[30] + output[31]; - step[31] = output[31] + output[30]; - - // Final stage --- outputs indices are bit-reversed. - output[0] = step[0]; - output[16] = step[1]; - output[8] = step[2]; - output[24] = step[3]; - output[4] = step[4]; - output[20] = step[5]; - output[12] = step[6]; - output[28] = step[7]; - output[2] = step[8]; - output[18] = step[9]; - output[10] = step[10]; - output[26] = step[11]; - output[6] = step[12]; - output[22] = step[13]; - output[14] = step[14]; - output[30] = step[15]; - - output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); - output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); - output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); - output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); - output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); - output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); - output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); - output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); - output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); - output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); - output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); - output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); - output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); - output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); - output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); - output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); -} - -void aom_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - int i, j; - tran_high_t output[32 * 32]; - - // Columns - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; - aom_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } - - // Rows - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; - aom_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } -} - -// Note that although we use dct_32_round in dct32 computation flow, -// this 2d fdct32x32 for rate-distortion optimization loop is operating -// within 16 bits precision. -void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { - int i, j; - tran_high_t output[32 * 32]; - - // Columns - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; - aom_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - // TODO(cd): see quality impact of only doing - // output[j * 32 + i] = (temp_out[j] + 1) >> 2; - // PS: also change code in aom_dsp/x86/aom_dct_sse2.c - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; - } - - // Rows - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; - aom_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; - } -} - -#if CONFIG_HIGHBITDEPTH -void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, - int stride) { - aom_fdct4x4_c(input, output, stride); -} - void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { aom_fdct8x8_c(input, final_output, stride); } - -void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, - int stride) { - aom_fdct16x16_c(input, output, stride); -} - -void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - aom_fdct32x32_c(input, out, stride); -} -void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, - int stride) { - aom_fdct32x32_rd_c(input, out, stride); -} - -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/fwd_txfm.h b/third_party/aom/aom_dsp/fwd_txfm.h deleted file mode 100644 index f4dc04ab4..000000000 --- a/third_party/aom/aom_dsp/fwd_txfm.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_FWD_TXFM_H_ -#define AOM_DSP_FWD_TXFM_H_ - -#include "aom_dsp/txfm_common.h" - -static INLINE tran_high_t saturate_int16(tran_high_t value) { - tran_high_t result; - result = value > INT16_MAX ? INT16_MAX : value; - return result < INT16_MIN ? INT16_MIN : result; -} - -void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round); -#endif // AOM_DSP_FWD_TXFM_H_ diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c new file mode 100644 index 000000000..fcb6c290e --- /dev/null +++ b/third_party/aom/aom_dsp/grain_synthesis.c @@ -0,0 +1,1392 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes film grain parameters and film grain synthesis + * + */ + +#include +#include +#include +#include "aom_dsp/grain_synthesis.h" +#include "aom_mem/aom_mem.h" + +// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits) +// with zero mean and standard deviation of about 512. +// should be divided by 4 for 10-bit range and 16 for 8-bit range. +static const int gaussian_sequence[2048] = { + 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820, + 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800, + 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588, + -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368, + 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4, + 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396, + 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740, + 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292, + 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532, + 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704, + 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96, + -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244, + 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136, + 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676, + -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400, + -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844, + -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96, + -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356, + 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280, + 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808, + 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228, + -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136, + -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264, + -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388, + 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500, + 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384, + 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220, + -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148, + 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572, + -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516, + 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916, + -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492, + 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560, + -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108, + -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516, + -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88, + -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196, + -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864, + 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920, + 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564, + -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876, + -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244, + 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184, + 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364, + -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72, + 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24, + 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4, + -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120, + 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108, + -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296, + 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336, + -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164, + -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264, + 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536, + -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296, + -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696, + 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204, + 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212, + -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40, + 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384, + 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8, + 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704, + -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348, + -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592, + -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420, + 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220, + -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208, + -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544, + -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288, + -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240, + -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132, + 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16, + -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044, + -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732, + 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460, + -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52, + -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104, + -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460, + 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716, + -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960, + 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476, + 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692, + 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352, + -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144, + -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44, + 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356, + 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452, + -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552, + -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264, + -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448, + -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588, + 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464, + 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216, + 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132, + 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412, + 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48, + 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196, + 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48, + -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292, + 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32, + -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012, + -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120, + -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56, + 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416, + -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404, + -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92, + 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904, + 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728, + 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584, + 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48, + 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180, + 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528, + 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364, + -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260, + -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324, + -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64, + 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120, + -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168, + -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888, + 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588, + -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484, + 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580, + 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392, + 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80, + -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688, + 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4, + -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300, + 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444, + 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192, + 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160, + 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188, + -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404, + -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400, + 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92, + -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824, + 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620, + 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720, + 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620, + -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508, + -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736, + 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836, + 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180, + 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140, + -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32, + -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916, + 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368, + -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380, + -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572, + -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864, + 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908, + -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84, + 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396, + -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360, + 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928, + -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288, + 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196, + 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504, + 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272, + 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344, + -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208, + -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156, + -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240, + -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432, + 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244, + 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584, + 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24, + 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300, + -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416, + 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380, + -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384, + 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88, + 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876, + -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320, + -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88, + -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196, + -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120, + 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664, + -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0, + -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264, + -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288, + -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56, + 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148, + 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156, + -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144, + -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148, + 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944, + 428, -484 +}; + +static const int gauss_bits = 11; + +static int luma_subblock_size_y = 32; +static int luma_subblock_size_x = 32; + +static int chroma_subblock_size_y = 16; +static int chroma_subblock_size_x = 16; + +static const int min_luma_legal_range = 16; +static const int max_luma_legal_range = 235; + +static const int min_chroma_legal_range = 16; +static const int max_chroma_legal_range = 240; + +static int scaling_lut_y[256]; +static int scaling_lut_cb[256]; +static int scaling_lut_cr[256]; + +static int grain_center; +static int grain_min; +static int grain_max; + +static uint16_t random_register = 0; // random number generator register + +static void init_arrays(aom_film_grain_t *params, int luma_stride, + int chroma_stride, int ***pred_pos_luma_p, + int ***pred_pos_chroma_p, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, int **cr_line_buf, + int **y_col_buf, int **cb_col_buf, int **cr_col_buf, + int luma_grain_samples, int chroma_grain_samples, + int chroma_subsamp_y, int chroma_subsamp_x) { + memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256); + memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256); + memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256); + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + int **pred_pos_luma; + int **pred_pos_chroma; + + pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma); + + for (int row = 0; row < num_pos_luma; row++) { + pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3); + } + + pred_pos_chroma = + (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma); + + for (int row = 0; row < num_pos_chroma; row++) { + pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3); + } + + int pos_ar_index = 0; + + for (int row = -params->ar_coeff_lag; row < 0; row++) { + for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1; + col++) { + pred_pos_luma[pos_ar_index][0] = row; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = row; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + ++pos_ar_index; + } + } + + for (int col = -params->ar_coeff_lag; col < 0; col++) { + pred_pos_luma[pos_ar_index][0] = 0; + pred_pos_luma[pos_ar_index][1] = col; + pred_pos_luma[pos_ar_index][2] = 0; + + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = col; + pred_pos_chroma[pos_ar_index][2] = 0; + + ++pos_ar_index; + } + + if (params->num_y_points > 0) { + pred_pos_chroma[pos_ar_index][0] = 0; + pred_pos_chroma[pos_ar_index][1] = 0; + pred_pos_chroma[pos_ar_index][2] = 1; + } + + *pred_pos_luma_p = pred_pos_luma; + *pred_pos_chroma_p = pred_pos_chroma; + + *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2); + *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride * + (2 >> chroma_subsamp_y)); + + *y_col_buf = + (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2); + *cb_col_buf = + (int *)aom_malloc(sizeof(**cb_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + *cr_col_buf = + (int *)aom_malloc(sizeof(**cr_col_buf) * + (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) * + (2 >> chroma_subsamp_x)); + + *luma_grain_block = + (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples); + *cb_grain_block = + (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples); + *cr_grain_block = + (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); +} + +static void dealloc_arrays(aom_film_grain_t *params, int ***pred_pos_luma, + int ***pred_pos_chroma, int **luma_grain_block, + int **cb_grain_block, int **cr_grain_block, + int **y_line_buf, int **cb_line_buf, + int **cr_line_buf, int **y_col_buf, int **cb_col_buf, + int **cr_col_buf) { + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int num_pos_chroma = num_pos_luma; + if (params->num_y_points > 0) ++num_pos_chroma; + + for (int row = 0; row < num_pos_luma; row++) { + aom_free((*pred_pos_luma)[row]); + } + aom_free(*pred_pos_luma); + + for (int row = 0; row < num_pos_chroma; row++) { + aom_free((*pred_pos_chroma)[row]); + } + aom_free((*pred_pos_chroma)); + + aom_free(*y_line_buf); + + aom_free(*cb_line_buf); + + aom_free(*cr_line_buf); + + aom_free(*y_col_buf); + + aom_free(*cb_col_buf); + + aom_free(*cr_col_buf); + + aom_free(*luma_grain_block); + + aom_free(*cb_grain_block); + + aom_free(*cr_grain_block); +} + +// get a number between 0 and 2^bits - 1 +static INLINE int get_random_number(int bits) { + uint16_t bit; + bit = ((random_register >> 0) ^ (random_register >> 1) ^ + (random_register >> 3) ^ (random_register >> 12)) & + 1; + random_register = (random_register >> 1) | (bit << 15); + return (random_register >> (16 - bits)) & ((1 << bits) - 1); +} + +static void init_random_generator(int luma_line, uint16_t seed) { + // same for the picture + + uint16_t msb = (seed >> 8) & 255; + uint16_t lsb = seed & 255; + + random_register = (msb << 8) + lsb; + + // changes for each row + int luma_num = luma_line >> 5; + + random_register ^= ((luma_num * 37 + 178) & 255) << 8; + random_register ^= ((luma_num * 173 + 105) & 255); +} + +static void generate_luma_grain_block( + aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, + int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, + int left_pad, int top_pad, int right_pad, int bottom_pad) { + if (params->num_y_points == 0) return; + + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + + for (int i = 0; i < luma_block_size_y; i++) + for (int j = 0; j < luma_block_size_x; j++) + luma_grain_block[i * luma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + + for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < luma_block_size_x - right_pad; j++) { + int wsum = 0; + for (int pos = 0; pos < num_pos_luma; pos++) { + wsum = wsum + params->ar_coeffs_y[pos] * + luma_grain_block[(i + pred_pos_luma[pos][0]) * + luma_grain_stride + + j + pred_pos_luma[pos][1]]; + } + luma_grain_block[i * luma_grain_stride + j] = + clamp(luma_grain_block[i * luma_grain_stride + j] + + ((wsum + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + } +} + +static void generate_chroma_grain_blocks( + aom_film_grain_t *params, + // int** pred_pos_luma, + int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, + int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y, + int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad, + int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) { + int bit_depth = params->bit_depth; + int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; + + int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); + if (params->num_y_points > 0) ++num_pos_chroma; + int rounding_offset = (1 << (params->ar_coeff_shift - 1)); + int chroma_grain_samples = chroma_block_size_y * chroma_block_size_x; + + if (params->num_cb_points || params->chroma_scaling_from_luma) { + init_random_generator(7 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cb_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_samples); + } + + if (params->num_cr_points || params->chroma_scaling_from_luma) { + init_random_generator(11 << 5, params->random_seed); + + for (int i = 0; i < chroma_block_size_y; i++) + for (int j = 0; j < chroma_block_size_x; j++) + cr_grain_block[i * chroma_grain_stride + j] = + (gaussian_sequence[get_random_number(gauss_bits)] + + ((1 << gauss_sec_shift) >> 1)) >> + gauss_sec_shift; + } else { + memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_samples); + } + + for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) + for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) { + int wsum_cb = 0; + int wsum_cr = 0; + for (int pos = 0; pos < num_pos_chroma; pos++) { + if (pred_pos_chroma[pos][2] == 0) { + wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * + cb_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * + cr_grain_block[(i + pred_pos_chroma[pos][0]) * + chroma_grain_stride + + j + pred_pos_chroma[pos][1]]; + } else if (pred_pos_chroma[pos][2] == 1) { + int av_luma = 0; + int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad; + int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad; + + for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1; + k++) + for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1; + l++) + av_luma += luma_grain_block[k * luma_grain_stride + l]; + + av_luma = + (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >> + (chroma_subsamp_y + chroma_subsamp_x); + + wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma; + wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma; + } else { + printf( + "Grain synthesis: prediction between two chroma components is " + "not supported!"); + exit(1); + } + } + if (params->num_cb_points || params->chroma_scaling_from_luma) + cb_grain_block[i * chroma_grain_stride + j] = + clamp(cb_grain_block[i * chroma_grain_stride + j] + + ((wsum_cb + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + if (params->num_cr_points || params->chroma_scaling_from_luma) + cr_grain_block[i * chroma_grain_stride + j] = + clamp(cr_grain_block[i * chroma_grain_stride + j] + + ((wsum_cr + rounding_offset) >> params->ar_coeff_shift), + grain_min, grain_max); + } +} + +static void init_scaling_function(int scaling_points[][2], int num_points, + int scaling_lut[]) { + if (num_points == 0) return; + + for (int i = 0; i < scaling_points[0][0]; i++) + scaling_lut[i] = scaling_points[0][1]; + + for (int point = 0; point < num_points - 1; point++) { + int delta_y = scaling_points[point + 1][1] - scaling_points[point][1]; + int delta_x = scaling_points[point + 1][0] - scaling_points[point][0]; + + int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x); + + for (int x = 0; x < delta_x; x++) { + scaling_lut[scaling_points[point][0] + x] = + scaling_points[point][1] + (int)((x * delta + 32768) >> 16); + } + } + + for (int i = scaling_points[num_points - 1][0]; i < 256; i++) + scaling_lut[i] = scaling_points[num_points - 1][1]; +} + +// function that extracts samples from a LUT (and interpolates intemediate +// frames for 10- and 12-bit video) +static int scale_LUT(int *scaling_lut, int index, int bit_depth) { + int x = index >> (bit_depth - 8); + + if (!(bit_depth - 8) || x == 255) + return scaling_lut[x]; + else + return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) * + (index & ((1 << (bit_depth - 8)) - 1)) + + (1 << (bit_depth - 9))) >> + (bit_depth - 8)); +} + +static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int luma_stride, + int chroma_stride, int *luma_grain, + int *cb_grain, int *cr_grain, + int luma_grain_stride, int chroma_grain_stride, + int half_luma_height, int half_luma_width, + int bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { + int cb_mult = params->cb_mult - 128; // fixed scale + int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale + int cb_offset = params->cb_offset - 256; + + int cr_mult = params->cr_mult - 128; // fixed scale + int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale + int cr_offset = params->cr_offset - 256; + + int rounding_offset = (1 << (params->scaling_shift - 1)); + + int apply_y = params->num_y_points > 0 ? 1 : 0; + int apply_cb = + (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0; + int apply_cr = + (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range; + max_luma = max_luma_legal_range; + + if (mc_identity) { + min_chroma = min_luma_legal_range; + max_chroma = max_luma_legal_range; + } else { + min_chroma = min_chroma_legal_range; + max_chroma = max_chroma_legal_range; + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = 255; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + 8) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void add_noise_to_block_hbd( + aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, + int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, + int *cr_grain, int luma_grain_stride, int chroma_grain_stride, + int half_luma_height, int half_luma_width, int bit_depth, + int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) { + int cb_mult = params->cb_mult - 128; // fixed scale + int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth); + + int cr_mult = params->cr_mult - 128; // fixed scale + int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale + // offset value depends on the bit depth + int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth); + + int rounding_offset = (1 << (params->scaling_shift - 1)); + + int apply_y = params->num_y_points > 0 ? 1 : 0; + int apply_cb = + (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 + : 0; + int apply_cr = + (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1 + : 0; + + if (params->chroma_scaling_from_luma) { + cb_mult = 0; // fixed scale + cb_luma_mult = 64; // fixed scale + cb_offset = 0; + + cr_mult = 0; // fixed scale + cr_luma_mult = 64; // fixed scale + cr_offset = 0; + } + + int min_luma, max_luma, min_chroma, max_chroma; + + if (params->clip_to_restricted_range) { + min_luma = min_luma_legal_range << (bit_depth - 8); + max_luma = max_luma_legal_range << (bit_depth - 8); + + if (mc_identity) { + min_chroma = min_luma_legal_range << (bit_depth - 8); + max_chroma = max_luma_legal_range << (bit_depth - 8); + } else { + min_chroma = min_chroma_legal_range << (bit_depth - 8); + max_chroma = max_chroma_legal_range << (bit_depth - 8); + } + } else { + min_luma = min_chroma = 0; + max_luma = max_chroma = (256 << (bit_depth - 8)) - 1; + } + + for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) { + for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) { + int average_luma = 0; + if (chroma_subsamp_x) { + average_luma = (luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x)] + + luma[(i << chroma_subsamp_y) * luma_stride + + (j << chroma_subsamp_x) + 1] + + 1) >> + 1; + } else { + average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j]; + } + + if (apply_cb) { + cb[i * chroma_stride + j] = clamp( + cb[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cb, + clamp(((average_luma * cb_luma_mult + + cb_mult * cb[i * chroma_stride + j]) >> + 6) + + cb_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cb_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + if (apply_cr) { + cr[i * chroma_stride + j] = clamp( + cr[i * chroma_stride + j] + + ((scale_LUT(scaling_lut_cr, + clamp(((average_luma * cr_luma_mult + + cr_mult * cr[i * chroma_stride + j]) >> + 6) + + cr_offset, + 0, (256 << (bit_depth - 8)) - 1), + bit_depth) * + cr_grain[i * chroma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_chroma, max_chroma); + } + } + } + + if (apply_y) { + for (int i = 0; i < (half_luma_height << 1); i++) { + for (int j = 0; j < (half_luma_width << 1); j++) { + luma[i * luma_stride + j] = + clamp(luma[i * luma_stride + j] + + ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], + bit_depth) * + luma_grain[i * luma_grain_stride + j] + + rounding_offset) >> + params->scaling_shift), + min_luma, max_luma); + } + } + } +} + +static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst, + int dst_stride, int width, int height, + int use_high_bit_depth) { + int hbd_coeff = use_high_bit_depth ? 2 : 1; + while (height) { + memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void copy_area(int *src, int src_stride, int *dst, int dst_stride, + int width, int height) { + while (height) { + memcpy(dst, src, width * sizeof(*src)); + src += src_stride; + dst += dst_stride; + --height; + } + return; +} + +static void extend_even(uint8_t *dst, int dst_stride, int width, int height, + int use_high_bit_depth) { + if ((width & 1) == 0 && (height & 1) == 0) return; + if (use_high_bit_depth) { + uint16_t *dst16 = (uint16_t *)dst; + int dst16_stride = dst_stride / 2; + if (width & 1) { + for (int i = 0; i < height; ++i) + dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride], + sizeof(*dst16) * width); + } + } else { + if (width & 1) { + for (int i = 0; i < height; ++i) + dst[i * dst_stride + width] = dst[i * dst_stride + width - 1]; + } + width = (width + 1) & (~1); + if (height & 1) { + memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride], + sizeof(*dst) * width); + } + } +} + +static void ver_boundary_overlap(int *left_block, int left_stride, + int *right_block, int right_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (width == 1) { + while (height) { + *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } else if (width == 2) { + while (height) { + dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5, + grain_min, grain_max); + left_block += left_stride; + right_block += right_stride; + dst_block += dst_stride; + --height; + } + return; + } +} + +static void hor_boundary_overlap(int *top_block, int top_stride, + int *bottom_block, int bottom_stride, + int *dst_block, int dst_stride, int width, + int height) { + if (height == 1) { + while (width) { + *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } else if (height == 2) { + while (width) { + dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5, + grain_min, grain_max); + dst_block[dst_stride] = clamp((17 * top_block[top_stride] + + 27 * bottom_block[bottom_stride] + 16) >> + 5, + grain_min, grain_max); + ++top_block; + ++bottom_block; + ++dst_block; + --width; + } + return; + } +} + +void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src, + aom_image_t *dst) { + uint8_t *luma, *cb, *cr; + int height, width, luma_stride, chroma_stride; + int use_high_bit_depth = 0; + int chroma_subsamp_x = 0; + int chroma_subsamp_y = 0; + int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 1 : 0; + + switch (src->fmt) { + case AOM_IMG_FMT_AOMI420: + case AOM_IMG_FMT_I420: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + case AOM_IMG_FMT_I42016: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 1; + break; + // case AOM_IMG_FMT_444A: + case AOM_IMG_FMT_I444: + use_high_bit_depth = 0; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I44416: + use_high_bit_depth = 1; + chroma_subsamp_x = 0; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I422: + use_high_bit_depth = 0; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + case AOM_IMG_FMT_I42216: + use_high_bit_depth = 1; + chroma_subsamp_x = 1; + chroma_subsamp_y = 0; + break; + default: // unknown input format + printf("Film grain error: input format is not supported!"); + exit(1); + } + + dst->r_w = src->r_w; + dst->r_h = src->r_h; + dst->d_w = src->d_w; + dst->d_h = src->d_h; + + dst->cp = src->cp; + dst->tc = src->tc; + dst->mc = src->mc; + + dst->monochrome = src->monochrome; + dst->csp = src->csp; + dst->range = src->range; + + dst->x_chroma_shift = src->x_chroma_shift; + dst->y_chroma_shift = src->y_chroma_shift; + + dst->temporal_id = src->temporal_id; + dst->spatial_id = src->spatial_id; + + width = src->d_w % 2 ? src->d_w + 1 : src->d_w; + height = src->d_h % 2 ? src->d_h + 1 : src->d_h; + + copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y], + dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + // Note that dst is already assumed to be aligned to even. + extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w, + src->d_h, use_high_bit_depth); + + if (!src->monochrome) { + copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U], + dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + + copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V], + dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V], + width >> chroma_subsamp_x, height >> chroma_subsamp_y, + use_high_bit_depth); + } + + luma = dst->planes[AOM_PLANE_Y]; + cb = dst->planes[AOM_PLANE_U]; + cr = dst->planes[AOM_PLANE_V]; + + // luma and chroma strides in samples + luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; + chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; + + params->bit_depth = dst->bit_depth; + + av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride, + chroma_stride, use_high_bit_depth, chroma_subsamp_y, + chroma_subsamp_x, mc_identity); + return; +} + +void av1_add_film_grain_run(aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { + int **pred_pos_luma; + int **pred_pos_chroma; + int *luma_grain_block; + int *cb_grain_block; + int *cr_grain_block; + + int *y_line_buf; + int *cb_line_buf; + int *cr_line_buf; + + int *y_col_buf; + int *cb_col_buf; + int *cr_col_buf; + + random_register = params->random_seed; + + int left_pad = 3; + int right_pad = 3; // padding to offset for AR coefficients + int top_pad = 3; + int bottom_pad = 0; + + int ar_padding = 3; // maximum lag used for stabilization of AR coefficients + + luma_subblock_size_y = 32; + luma_subblock_size_x = 32; + + chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y; + chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x; + + // Initial padding is only needed for generation of + // film grain templates (to stabilize the AR process) + // Only a 64x64 luma and 32x32 chroma part of a template + // is used later for adding grain, padding can be discarded + + int luma_block_size_y = + top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad; + int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 + + 2 * ar_padding + right_pad; + + int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + chroma_subblock_size_y * 2 + bottom_pad; + int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + chroma_subblock_size_x * 2 + + (2 >> chroma_subsamp_x) * ar_padding + right_pad; + + int luma_grain_stride = luma_block_size_x; + int chroma_grain_stride = chroma_block_size_x; + + int overlap = params->overlap_flag; + int bit_depth = params->bit_depth; + + grain_center = 128 << (bit_depth - 8); + grain_min = 0 - grain_center; + grain_max = (256 << (bit_depth - 8)) - 1 - grain_center; + + init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma, + &pred_pos_chroma, &luma_grain_block, &cb_grain_block, + &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, + &y_col_buf, &cb_col_buf, &cr_col_buf, + luma_block_size_y * luma_block_size_x, + chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, + chroma_subsamp_x); + + generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, + luma_block_size_y, luma_block_size_x, + luma_grain_stride, left_pad, top_pad, right_pad, + bottom_pad); + + generate_chroma_grain_blocks( + params, + // pred_pos_luma, + pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, + luma_grain_stride, chroma_block_size_y, chroma_block_size_x, + chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, + chroma_subsamp_y, chroma_subsamp_x); + + init_scaling_function(params->scaling_points_y, params->num_y_points, + scaling_lut_y); + + if (params->chroma_scaling_from_luma) { + memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256); + } else { + init_scaling_function(params->scaling_points_cb, params->num_cb_points, + scaling_lut_cb); + init_scaling_function(params->scaling_points_cr, params->num_cr_points, + scaling_lut_cr); + } + for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) { + init_random_generator(y * 2, params->random_seed); + + for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) { + int offset_y = get_random_number(8); + int offset_x = (offset_y >> 4) & 15; + offset_y &= 15; + + int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1); + int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1); + + int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding + + offset_y * (2 >> chroma_subsamp_y); + int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding + + offset_x * (2 >> chroma_subsamp_x); + + if (overlap && x) { + ver_boundary_overlap( + y_col_buf, 2, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + ver_boundary_overlap( + cb_col_buf, 2 >> chroma_subsamp_x, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + ver_boundary_overlap( + cr_col_buf, 2 >> chroma_subsamp_x, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + int i = y ? 1 : 0; + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, + (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1), + (uint16_t *)cb + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + (uint16_t *)cr + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + ((y + i) << 1) * luma_stride + (x << 1), + cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + (x << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, y_col_buf + i * 4, + cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x), + 2, (2 - chroma_subsamp_x), + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1, + bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + } + + if (overlap && y) { + if (x) { + hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2, + y_line_buf + (x << 1), luma_stride, 2, 2); + + hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x, + cb_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + + hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x, + cr_line_buf + x * (2 >> chroma_subsamp_x), + chroma_stride, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_y); + } + + hor_boundary_overlap( + y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + (x ? 2 : 0), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1), + width - ((x ? x + 1 : 0) << 1)), + 2); + + hor_boundary_overlap( + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x - + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), + 2 >> chroma_subsamp_y); + + hor_boundary_overlap( + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + chroma_grain_stride, + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x - + ((x ? 1 : 0) << (1 - chroma_subsamp_x)), + (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x), + 2 >> chroma_subsamp_y); + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1), + (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + luma_stride, chroma_stride, y_line_buf + (x << 1), + cb_line_buf + (x << (1 - chroma_subsamp_x)), + cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, + chroma_stride, 1, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + (y << 1) * luma_stride + (x << 1), + cb + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + cr + (y << (1 - chroma_subsamp_y)) * chroma_stride + + (x << ((1 - chroma_subsamp_x))), + luma_stride, chroma_stride, y_line_buf + (x << 1), + cb_line_buf + (x << (1 - chroma_subsamp_x)), + cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride, + chroma_stride, 1, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + } + + int i = overlap && y ? 1 : 0; + int j = overlap && x ? 1 : 0; + + if (use_high_bit_depth) { + add_noise_to_block_hbd( + params, + (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), + (uint16_t *)cb + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + (uint16_t *)cr + + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, + luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + + luma_offset_x + (j << 1), + cb_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + cr_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + luma_grain_stride, chroma_grain_stride, + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } else { + add_noise_to_block( + params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1), + cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride + + ((x + j) << (1 - chroma_subsamp_x)), + luma_stride, chroma_stride, + luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride + + luma_offset_x + (j << 1), + cb_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + cr_grain_block + + (chroma_offset_y + (i << (1 - chroma_subsamp_y))) * + chroma_grain_stride + + chroma_offset_x + (j << (1 - chroma_subsamp_x)), + luma_grain_stride, chroma_grain_stride, + AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, + AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth, + chroma_subsamp_y, chroma_subsamp_x, mc_identity); + } + + if (overlap) { + if (x) { + // Copy overlapped column bufer to line buffer + copy_area(y_col_buf + (luma_subblock_size_y << 1), 2, + y_line_buf + (x << 1), luma_stride, 2, 2); + + copy_area( + cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + + copy_area( + cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)), + 2 >> chroma_subsamp_x, + cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride, + 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y); + } + + // Copy grain to the line buffer for overlap with a bottom block + copy_area( + luma_grain_block + + (luma_offset_y + luma_subblock_size_y) * luma_grain_stride + + luma_offset_x + ((x ? 2 : 0)), + luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride, + AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2); + + copy_area(cb_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + copy_area(cr_grain_block + + (chroma_offset_y + chroma_subblock_size_y) * + chroma_grain_stride + + chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0), + chroma_grain_stride, + cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)), + chroma_stride, + AOMMIN(chroma_subblock_size_x, + ((width - (x << 1)) >> chroma_subsamp_x)) - + (x ? 2 >> chroma_subsamp_x : 0), + 2 >> chroma_subsamp_y); + + // Copy grain to the column buffer for overlap with the next block to + // the right + + copy_area(luma_grain_block + luma_offset_y * luma_grain_stride + + luma_offset_x + luma_subblock_size_x, + luma_grain_stride, y_col_buf, 2, 2, + AOMMIN(luma_subblock_size_y + 2, height - (y << 1))); + + copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + + copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride + + chroma_offset_x + chroma_subblock_size_x, + chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x, + 2 >> chroma_subsamp_x, + AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y), + (height - (y << 1)) >> chroma_subsamp_y)); + } + } + } + + dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, + &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, + &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); +} diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h new file mode 100644 index 000000000..016cb12d7 --- /dev/null +++ b/third_party/aom/aom_dsp/grain_synthesis.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief Describes film grain parameters and film grain synthesis + * + */ +#ifndef AOM_AOM_GRAIN_SYNTHESIS_H_ +#define AOM_AOM_GRAIN_SYNTHESIS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom_dsp/aom_dsp_common.h" +#include "aom/aom_image.h" + +/*!\brief Structure containing film grain synthesis parameters for a frame + * + * This structure contains input parameters for film grain synthesis + */ +typedef struct { + int apply_grain; + + int update_parameters; + + // 8 bit values + int scaling_points_y[14][2]; + int num_y_points; // value: 0..14 + + // 8 bit values + int scaling_points_cb[10][2]; + int num_cb_points; // value: 0..10 + + // 8 bit values + int scaling_points_cr[10][2]; + int num_cr_points; // value: 0..10 + + int scaling_shift; // values : 8..11 + + int ar_coeff_lag; // values: 0..3 + + // 8 bit values + int ar_coeffs_y[24]; + int ar_coeffs_cb[25]; + int ar_coeffs_cr[25]; + + // Shift value: AR coeffs range + // 6: [-2, 2) + // 7: [-1, 1) + // 8: [-0.5, 0.5) + // 9: [-0.25, 0.25) + int ar_coeff_shift; // values : 6..9 + + int cb_mult; // 8 bits + int cb_luma_mult; // 8 bits + int cb_offset; // 9 bits + + int cr_mult; // 8 bits + int cr_luma_mult; // 8 bits + int cr_offset; // 9 bits + + int overlap_flag; + + int clip_to_restricted_range; + + int bit_depth; // video bit depth + + int chroma_scaling_from_luma; + + int grain_scale_shift; + + uint16_t random_seed; +} aom_film_grain_t; + +/*!\brief Add film grain + * + * Add film grain to an image + * + * \param[in] grain_params Grain parameters + * \param[in] luma luma plane + * \param[in] cb cb plane + * \param[in] cr cr plane + * \param[in] height luma plane height + * \param[in] width luma plane width + * \param[in] luma_stride luma plane stride + * \param[in] chroma_stride chroma plane stride + */ +void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity); + +/*!\brief Add film grain + * + * Add film grain to an image + * + * \param[in] grain_params Grain parameters + * \param[in] src Source image + * \param[in] dst Resulting image with grain + */ +void av1_add_film_grain(aom_film_grain_t *grain_params, aom_image_t *src, + aom_image_t *dst); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AOM_GRAIN_SYNTHESIS_H_ diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c new file mode 100644 index 000000000..0d6a73f55 --- /dev/null +++ b/third_party/aom/aom_dsp/grain_table.c @@ -0,0 +1,333 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief This file has the implementation details of the grain table. + * + * The file format is an ascii representation for readability and + * editability. Array parameters are separated from the non-array + * parameters and prefixed with a few characters to make for easy + * localization with a parameter set. Each entry is prefixed with "E" + * and the other parameters are only specified if "update-parms" is + * non-zero. + * + * filmgrn1 + * E + * p ... + * sY ... + * sCb ... + * sCr ... + * cY .... + * cCb .... + * cCr .... + * E ... + */ +#include +#include +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/grain_table.h" +#include "aom_mem/aom_mem.h" + +static const char kFileMagic[8] = "filmgrn1"; + +static void grain_table_entry_read(FILE *file, + struct aom_internal_error_info *error_info, + aom_film_grain_table_entry_t *entry) { + aom_film_grain_t *pars = &entry->params; + int num_read = + fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time, + &entry->end_time, &pars->apply_grain, &pars->random_seed, + &pars->update_parameters); + if (num_read == 0 && feof(file)) return; + if (num_read != 5) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read entry header. Read %d != 5", num_read); + return; + } + if (pars->update_parameters) { + num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n", + &pars->ar_coeff_lag, &pars->ar_coeff_shift, + &pars->grain_scale_shift, &pars->scaling_shift, + &pars->chroma_scaling_from_luma, &pars->overlap_flag, + &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset, + &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset); + if (num_read != 12) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read entry params. Read %d != 12", + num_read); + return; + } + if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num y points"); + return; + } + for (int i = 0; i < pars->num_y_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0], + &pars->scaling_points_y[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read y scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cb points"); + return; + } + for (int i = 0; i < pars->num_cb_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0], + &pars->scaling_points_cb[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cb scaling points"); + return; + } + } + if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read num cr points"); + return; + } + for (int i = 0; i < pars->num_cr_points; ++i) { + if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0], + &pars->scaling_points_cr[i][1])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read cr scaling points"); + return; + } + } + + fscanf(file, "\n\tcY"); + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Y coeffs"); + return; + } + } + fscanf(file, "\n\tcCb"); + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cb coeffs"); + return; + } + } + fscanf(file, "\n\tcCr"); + for (int i = 0; i <= n; ++i) { + if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read Cr coeffs"); + return; + } + } + fscanf(file, "\n"); + } +} + +void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) { + const aom_film_grain_t *pars = &entry->params; + fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time, + entry->end_time, pars->apply_grain, pars->random_seed, + pars->update_parameters); + if (pars->update_parameters) { + fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n", + pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift, + pars->scaling_shift, pars->chroma_scaling_from_luma, + pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult, + pars->cb_offset, pars->cr_mult, pars->cr_luma_mult, + pars->cr_offset); + fprintf(file, "\tsY %d ", pars->num_y_points); + for (int i = 0; i < pars->num_y_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_y[i][0], + pars->scaling_points_y[i][1]); + } + fprintf(file, "\n\tsCb %d", pars->num_cb_points); + for (int i = 0; i < pars->num_cb_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cb[i][0], + pars->scaling_points_cb[i][1]); + } + fprintf(file, "\n\tsCr %d", pars->num_cr_points); + for (int i = 0; i < pars->num_cr_points; ++i) { + fprintf(file, " %d %d", pars->scaling_points_cr[i][0], + pars->scaling_points_cr[i][1]); + } + fprintf(file, "\n\tcY"); + const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1); + for (int i = 0; i < n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_y[i]); + } + fprintf(file, "\n\tcCb"); + for (int i = 0; i <= n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_cb[i]); + } + fprintf(file, "\n\tcCr"); + for (int i = 0; i <= n; ++i) { + fprintf(file, " %d", pars->ar_coeffs_cr[i]); + } + fprintf(file, "\n"); + } +} + +void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp, + int64_t end_time, + const aom_film_grain_t *grain) { + if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) { + aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail)); + memset(new_tail, 0, sizeof(*new_tail)); + if (t->tail) t->tail->next = new_tail; + if (!t->head) t->head = new_tail; + t->tail = new_tail; + + new_tail->start_time = time_stamp; + new_tail->end_time = end_time; + new_tail->params = *grain; + } else { + t->tail->end_time = AOMMAX(t->tail->end_time, end_time); + t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp); + } +} + +int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, + int64_t end_time, int erase, + aom_film_grain_t *grain) { + aom_film_grain_table_entry_t *entry = t->head; + aom_film_grain_table_entry_t *prev_entry = 0; + int16_t random_seed = grain ? grain->random_seed : 0; + if (grain) memset(grain, 0, sizeof(*grain)); + + while (entry) { + aom_film_grain_table_entry_t *next = entry->next; + if (time_stamp >= entry->start_time && time_stamp < entry->end_time) { + if (grain) { + *grain = entry->params; + if (time_stamp != 0) grain->random_seed = random_seed; + } + if (!erase) return 1; + + const int64_t entry_end_time = entry->end_time; + if (time_stamp <= entry->start_time && end_time >= entry->end_time) { + if (t->tail == entry) t->tail = prev_entry; + if (prev_entry) { + prev_entry->next = entry->next; + } else { + t->head = entry->next; + } + aom_free(entry); + } else if (time_stamp <= entry->start_time && + end_time < entry->end_time) { + entry->start_time = end_time; + } else if (time_stamp > entry->start_time && + end_time >= entry->end_time) { + entry->end_time = time_stamp; + } else { + aom_film_grain_table_entry_t *new_entry = + aom_malloc(sizeof(*new_entry)); + new_entry->next = entry->next; + new_entry->start_time = end_time; + new_entry->end_time = entry->end_time; + new_entry->params = entry->params; + entry->next = new_entry; + entry->end_time = time_stamp; + if (t->tail == entry) t->tail = new_entry; + } + // If segments aren't aligned, delete from the beggining of subsequent + // segments + if (end_time > entry_end_time) { + aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0); + } + return 1; + } + prev_entry = entry; + entry = next; + } + return 0; +} + +aom_codec_err_t aom_film_grain_table_read( + aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info) { + FILE *file = fopen(filename, "rb"); + if (!file) { + aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s", + filename); + return error_info->error_code; + } + error_info->error_code = AOM_CODEC_OK; + + // Read in one extra character as there should be white space after + // the header. + char magic[9]; + if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to read (or invalid) file magic"); + fclose(file); + return error_info->error_code; + } + + aom_film_grain_table_entry_t *prev_entry = 0; + while (!feof(file)) { + aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry)); + memset(entry, 0, sizeof(*entry)); + grain_table_entry_read(file, error_info, entry); + entry->next = 0; + + if (prev_entry) prev_entry->next = entry; + if (!t->head) t->head = entry; + t->tail = entry; + prev_entry = entry; + + if (error_info->error_code != AOM_CODEC_OK) break; + } + + fclose(file); + return error_info->error_code; +} + +aom_codec_err_t aom_film_grain_table_write( + const aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info) { + error_info->error_code = AOM_CODEC_OK; + + FILE *file = fopen(filename, "wb"); + if (!file) { + aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s", + filename); + return error_info->error_code; + } + + if (!fwrite(kFileMagic, 8, 1, file)) { + aom_internal_error(error_info, AOM_CODEC_ERROR, + "Unable to write file magic"); + fclose(file); + return error_info->error_code; + } + + fprintf(file, "\n"); + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + grain_table_entry_write(file, entry); + entry = entry->next; + } + fclose(file); + return error_info->error_code; +} + +void aom_film_grain_table_free(aom_film_grain_table_t *t) { + aom_film_grain_table_entry_t *entry = t->head; + while (entry) { + aom_film_grain_table_entry_t *next = entry->next; + aom_free(entry); + entry = next; + } + memset(t, 0, sizeof(*t)); +} diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h new file mode 100644 index 000000000..5c20413b2 --- /dev/null +++ b/third_party/aom/aom_dsp/grain_table.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/*!\file + * \brief A table mapping from time to corresponding film grain parameters. + * + * In order to apply grain synthesis in the decoder, the film grain parameters + * need to be signalled in the encoder. The film grain parameters are time + * varying, and for two-pass encoding (and denoiser implementation flexibility) + * it is common to denoise the video and do parameter estimation before encoding + * the denoised video. + * + * The film grain table is used to provide this flexibility and is used as a + * parameter that is passed to the encoder. + * + * Further, if regraining is to be done in say a single pass mode, or in two + * pass within the encoder (before frames are added to the lookahead buffer), + * this data structure can be used to keep track of on-the-fly estimated grain + * parameters, that are then extracted from the table before the encoded frame + * is written. + */ +#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_ +#define AOM_AOM_DSP_GRAIN_TABLE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "aom_dsp/grain_synthesis.h" +#include "aom/internal/aom_codec_internal.h" + +typedef struct aom_film_grain_table_entry_t { + aom_film_grain_t params; + int64_t start_time; + int64_t end_time; + struct aom_film_grain_table_entry_t *next; +} aom_film_grain_table_entry_t; + +typedef struct { + aom_film_grain_table_entry_t *head; + aom_film_grain_table_entry_t *tail; +} aom_film_grain_table_t; + +/*!\brief Add a mapping from [time_stamp, end_time) to the given grain + * parameters + * + * \param[in/out] table The grain table + * \param[in] time_stamp The start time stamp + * \param[in] end_stamp The end time_stamp + * \param[in] grain The grain parameters + */ +void aom_film_grain_table_append(aom_film_grain_table_t *table, + int64_t time_stamp, int64_t end_time, + const aom_film_grain_t *grain); + +/*!\brief Look-up (and optionally erase) the grain parameters for the given time + * + * \param[in] table The grain table + * \param[in] time_stamp The start time stamp + * \param[in] end_stamp The end time_stamp + * \param[in] erase Whether the time segment can be deleted + * \param[out] grain The output grain parameters + */ +int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp, + int64_t end_time, int erase, + aom_film_grain_t *grain); + +/*!\brief Reads the grain table from a file. + * + * \param[out] table The grain table + * \param[in] filename The file to read from + * \param[in] error_info Error info for tracking errors + */ +aom_codec_err_t aom_film_grain_table_read( + aom_film_grain_table_t *table, const char *filename, + struct aom_internal_error_info *error_info); + +/*!\brief Writes the grain table from a file. + * + * \param[out] table The grain table + * \param[in] filename The file to read from + * \param[in] error_info Error info for tracking errors + */ +aom_codec_err_t aom_film_grain_table_write( + const aom_film_grain_table_t *t, const char *filename, + struct aom_internal_error_info *error_info); + +void aom_film_grain_table_free(aom_film_grain_table_t *t); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c index 6d2ac37d9..c6aa6b207 100644 --- a/third_party/aom/aom_dsp/intrapred.c +++ b/third_party/aom/aom_dsp/intrapred.c @@ -12,152 +12,14 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/intrapred_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" -#define DST(x, y) dst[(x) + (y)*stride] -#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) -#define AVG2(a, b) (((a) + (b) + 1) >> 1) - -static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int r, c; - (void)above; - - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } -} - -static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } -} - -static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int r, c; - (void)left; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bw + bh)]); - } - dst += stride; - } -} - -static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int r, c; - - // first row - for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]); - dst += stride; - - // second row - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst += stride; - - // the rest of first col - dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bh; ++r) - dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); - - // the rest of the block - for (r = 2; r < bh; ++r) { - for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1]; - dst += stride; - } -} - -static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int i; -#if CONFIG_TX64X64 -#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 - // silence a spurious -Warray-bounds warning, possibly related to: - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 - uint8_t border[133]; -#else - uint8_t border[64 + 64 - 1]; // outer border from bottom-left to top-right -#endif -#else -#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7 - // silence a spurious -Warray-bounds warning, possibly related to: - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273 - uint8_t border[69]; -#else - uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right -#endif -#endif // CONFIG_TX64X64 - - // dst(bh, bh - 2)[0], i.e., border starting at bottom-left - for (i = 0; i < bh - 2; ++i) { - border[i] = AVG3(left[bh - 3 - i], left[bh - 2 - i], left[bh - 1 - i]); - } - border[bh - 2] = AVG3(above[-1], left[0], left[1]); - border[bh - 1] = AVG3(left[0], above[-1], above[0]); - border[bh - 0] = AVG3(above[-1], above[0], above[1]); - // dst[0][2, size), i.e., remaining top border ascending - for (i = 0; i < bw - 2; ++i) { - border[bh + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]); - } - - for (i = 0; i < bh; ++i) { - memcpy(dst + i * stride, border + bh - 1 - i, bw); - } -} - -static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bw, - int bh, const uint8_t *above, - const uint8_t *left) { - int r, c; - dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); - dst++; - - dst[0] = AVG3(left[0], above[-1], above[0]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bh; r++) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - dst++; - - for (c = 0; c < bw - 2; c++) - dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); - dst += stride; - - for (r = 1; r < bh; ++r) { - for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2]; - dst += stride; - } -} - static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { int r; @@ -244,13 +106,12 @@ static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw, for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); + dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } -#if CONFIG_SMOOTH_HV static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left) { @@ -274,7 +135,7 @@ static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); + dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } @@ -303,12 +164,11 @@ static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel(divide_round(this_pred, log2_scale)); + dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } -#endif // CONFIG_SMOOTH_HV static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, @@ -373,267 +233,133 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh, } } -void aom_d45e_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - (void)stride; - (void)left; - - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(1, 1) = AVG3(C, D, D); +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier, int shift2) { + const int interm = num >> shift1; + return interm * multiplier >> shift2; } -void aom_d117_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - DST(0, 0) = AVG2(X, A); - DST(1, 0) = AVG2(A, B); - DST(0, 1) = AVG3(I, X, A); - DST(1, 1) = AVG3(X, A, B); -} + // The constants (multiplier and shifts) for a given block size are obtained + // as follows: + // - Let sum_w_h = block width + block height. + // - Shift 'sum_w_h' right until we reach an odd number. Let the number of + // shifts for that block size be called 'shift1' (see the parameter in + // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2 + // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect + // block]. + // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5, + // using the "Algorithm 1" in: + // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632 + // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd + // shift will be 16, regardless of the block size. -void aom_d135_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - (void)stride; - DST(0, 1) = AVG3(X, I, J); - DST(1, 1) = DST(0, 0) = AVG3(A, X, I); - DST(1, 0) = AVG3(B, A, X); -} + // Note: For low bitdepth, assembly code may be optimized by using smaller + // constants for smaller block sizes, where the range of the 'sum' is + // restricted to fewer bits. -void aom_d153_predictor_2x2_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int X = above[-1]; - const int A = above[0]; - - DST(0, 0) = AVG2(I, X); - DST(0, 1) = AVG2(J, I); - DST(1, 0) = AVG3(I, X, A); - DST(1, 1) = AVG3(J, I, X); -} +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 -void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - const int E = above[4]; - const int F = above[5]; - const int G = above[6]; - const int H = above[7]; - (void)stride; - (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); - DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = AVG3(G, H, H); -} +#define DC_SHIFT2 16 -void aom_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - DST(0, 0) = DST(1, 2) = AVG2(X, A); - DST(1, 0) = DST(2, 2) = AVG2(A, B); - DST(2, 0) = DST(3, 2) = AVG2(B, C); - DST(3, 0) = AVG2(C, D); - - DST(0, 3) = AVG3(K, J, I); - DST(0, 2) = AVG3(J, I, X); - DST(0, 1) = DST(1, 3) = AVG3(I, X, A); - DST(1, 1) = DST(2, 3) = AVG3(X, A, B); - DST(2, 1) = DST(3, 3) = AVG3(A, B, C); - DST(3, 1) = AVG3(B, C, D); -} +static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw, + int bh, const uint8_t *above, + const uint8_t *left, int shift1, + int multiplier) { + int sum = 0; -void aom_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - (void)stride; - DST(0, 3) = AVG3(J, K, L); - DST(1, 3) = DST(0, 2) = AVG3(I, J, K); - DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); - DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); - DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); - DST(3, 1) = DST(2, 0) = AVG3(C, B, A); - DST(3, 0) = AVG3(D, C, B); + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2); + assert(expected_dc < (1 << 8)); + + for (int r = 0; r < bh; r++) { + memset(dst, expected_dc, bw); + dst += stride; + } } -void aom_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - - DST(0, 0) = DST(2, 1) = AVG2(I, X); - DST(0, 1) = DST(2, 2) = AVG2(J, I); - DST(0, 2) = DST(2, 3) = AVG2(K, J); - DST(0, 3) = AVG2(L, K); - - DST(3, 0) = AVG3(A, B, C); - DST(2, 0) = AVG3(X, A, B); - DST(1, 0) = DST(3, 1) = AVG3(I, X, A); - DST(1, 1) = DST(3, 2) = AVG3(J, I, X); - DST(1, 2) = DST(3, 3) = AVG3(K, J, I); - DST(1, 3) = AVG3(L, K, J); +#undef DC_SHIFT2 + +void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2); } -#if CONFIG_HIGHBITDEPTH -static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)above; - (void)bd; +void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2); +} - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = c & 1 ? AVG3(left[(c >> 1) + r], left[(c >> 1) + r + 1], - left[(c >> 1) + r + 2]) - : AVG2(left[(c >> 1) + r], left[(c >> 1) + r + 1]); - } - dst += stride; - } +void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4); } -static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)left; - (void)bd; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } +void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4); } -static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)left; - (void)bd; - for (r = 0; r < bh; ++r) { - for (c = 0; c < bw; ++c) { - dst[c] = AVG3(above[r + c], above[r + c + 1], - above[r + c + 1 + (r + c + 2 < bw + bh)]); - } - dst += stride; - } +void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2); } -static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)bd; +void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2); +} - // first row - for (c = 0; c < bw; c++) dst[c] = AVG2(above[c - 1], above[c]); - dst += stride; +void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4); +} - // second row - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst += stride; +void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4); +} - // the rest of first col - dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bh; ++r) - dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); +void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2); +} - // the rest of the block - for (r = 2; r < bh; ++r) { - for (c = 1; c < bw; c++) dst[c] = dst[-2 * stride + c - 1]; - dst += stride; - } +void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2); } -static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)bd; - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bw; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); +void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4); +} - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bh; ++r) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); +void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4); +} - dst += stride; - for (r = 1; r < bh; ++r) { - for (c = 1; c < bw; c++) dst[c] = dst[-stride + c - 1]; - dst += stride; - } +void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2); } -static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, - int bw, int bh, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void)bd; - dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bh; r++) dst[r * stride] = AVG2(left[r - 1], left[r]); - dst++; - - dst[0] = AVG3(left[0], above[-1], above[0]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bh; r++) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - dst++; - - for (c = 0; c < bw - 2; c++) - dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); - dst += stride; - - for (r = 1; r < bh; ++r) { - for (c = 0; c < bw - 2; c++) dst[c] = dst[-stride + c - 2]; - dst += stride; - } +void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2); } +#undef DC_MULTIPLIER_1X2 +#undef DC_MULTIPLIER_1X4 + static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -658,93 +384,6 @@ static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, } } -void aom_highbd_d207_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - (void)above; - (void)bd; - DST(0, 0) = AVG2(I, J); - DST(0, 1) = AVG2(J, K); - DST(1, 0) = AVG3(I, J, K); - DST(1, 1) = AVG3(J, K, L); -} - -void aom_highbd_d63_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, const uint16_t *left, - int bd) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - (void)left; - (void)bd; - DST(0, 0) = AVG2(A, B); - DST(1, 0) = AVG2(B, C); - DST(0, 1) = AVG3(A, B, C); - DST(1, 1) = AVG3(B, C, D); -} - -void aom_highbd_d45e_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - (void)stride; - (void)left; - (void)bd; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(1, 1) = AVG3(C, D, D); -} - -void aom_highbd_d117_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - (void)bd; - DST(0, 0) = AVG2(X, A); - DST(1, 0) = AVG2(A, B); - DST(0, 1) = AVG3(I, X, A); - DST(1, 1) = AVG3(X, A, B); -} - -void aom_highbd_d135_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - (void)bd; - DST(0, 1) = AVG3(X, I, J); - DST(1, 1) = DST(0, 0) = AVG3(A, X, I); - DST(1, 0) = AVG3(B, A, X); -} - -void aom_highbd_d153_predictor_2x2_c(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int X = above[-1]; - const int A = above[0]; - (void)bd; - DST(0, 0) = AVG2(I, X); - DST(0, 1) = AVG2(J, I); - DST(1, 0) = AVG3(I, X, A); - DST(1, 1) = AVG3(J, I, X); -} - static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { @@ -763,6 +402,7 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { + (void)bd; const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel const uint8_t *const sm_weights_w = sm_weight_arrays + bw; @@ -785,17 +425,17 @@ static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride, for (i = 0; i < 4; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); + dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } -#if CONFIG_SMOOTH_HV static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { + (void)bd; const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel const uint8_t *const sm_weights = sm_weight_arrays + bh; // scale = 2^sm_weight_log2_scale @@ -816,7 +456,7 @@ static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride, for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); + dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } @@ -826,6 +466,7 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int bd) { + (void)bd; const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel const uint8_t *const sm_weights = sm_weight_arrays + bw; // scale = 2^sm_weight_log2_scale @@ -846,12 +487,11 @@ static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride, for (i = 0; i < 2; ++i) { this_pred += weights[i] * pixels[i]; } - dst[c] = clip_pixel_highbd(divide_round(this_pred, log2_scale), bd); + dst[c] = divide_round(this_pred, log2_scale); } dst += stride; } } -#endif // CONFIG_SMOOTH_HV static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, int bw, int bh, @@ -922,7 +562,148 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, dst += stride; } } -#endif // CONFIG_HIGHBITDEPTH + +// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but +// assume 2nd shift of 17 bits instead of 16. +// Note: Strictly speaking, 2nd shift needs to be 17 only when: +// - bit depth == 12, and +// - bw + bh is divisible by 5 (as opposed to divisible by 3). +// All other cases can use half the multipliers with a shift of 16 instead. +// This special optimization can be used when writing assembly code. +#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB +// Note: This constant is odd, but a smaller even constant (0x199a) with the +// appropriate shift should work for neon in 8/10-bit. +#define HIGHBD_DC_MULTIPLIER_1X4 0x6667 + +#define HIGHBD_DC_SHIFT2 17 + +static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride, + int bw, int bh, + const uint16_t *above, + const uint16_t *left, int bd, + int shift1, uint32_t multiplier) { + int sum = 0; + (void)bd; + + for (int i = 0; i < bw; i++) { + sum += above[i]; + } + for (int i = 0; i < bh; i++) { + sum += left[i]; + } + + const int expected_dc = divide_using_multiply_shift( + sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2); + assert(expected_dc < (1 << bd)); + + for (int r = 0; r < bh; r++) { + aom_memset16(dst, expected_dc, bw); + dst += stride; + } +} + +#undef HIGHBD_DC_SHIFT2 + +void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4, + HIGHBD_DC_MULTIPLIER_1X4); +} + +void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5, + HIGHBD_DC_MULTIPLIER_1X2); +} + +#undef HIGHBD_DC_MULTIPLIER_1X2 +#undef HIGHBD_DC_MULTIPLIER_1X4 // This serves as a wrapper function, so that all the prediction functions // can be unified and accessed as a pointer array. Note that the boundary @@ -934,7 +715,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, type##_predictor(dst, stride, width, height, above, left); \ } -#if CONFIG_HIGHBITDEPTH #define intra_pred_highbd_sized(type, width, height) \ void aom_highbd_##type##_predictor_##width##x##height##_c( \ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ @@ -943,7 +723,6 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, } /* clang-format off */ -#if CONFIG_TX64X64 #define intra_pred_rectangular(type) \ intra_pred_sized(type, 4, 8) \ intra_pred_sized(type, 8, 4) \ @@ -953,6 +732,12 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, intra_pred_sized(type, 32, 16) \ intra_pred_sized(type, 32, 64) \ intra_pred_sized(type, 64, 32) \ + intra_pred_sized(type, 4, 16) \ + intra_pred_sized(type, 16, 4) \ + intra_pred_sized(type, 8, 32) \ + intra_pred_sized(type, 32, 8) \ + intra_pred_sized(type, 16, 64) \ + intra_pred_sized(type, 64, 16) \ intra_pred_highbd_sized(type, 4, 8) \ intra_pred_highbd_sized(type, 8, 4) \ intra_pred_highbd_sized(type, 8, 16) \ @@ -960,7 +745,13 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, intra_pred_highbd_sized(type, 16, 32) \ intra_pred_highbd_sized(type, 32, 16) \ intra_pred_highbd_sized(type, 32, 64) \ - intra_pred_highbd_sized(type, 64, 32) + intra_pred_highbd_sized(type, 64, 32) \ + intra_pred_highbd_sized(type, 4, 16) \ + intra_pred_highbd_sized(type, 16, 4) \ + intra_pred_highbd_sized(type, 8, 32) \ + intra_pred_highbd_sized(type, 32, 8) \ + intra_pred_highbd_sized(type, 16, 64) \ + intra_pred_highbd_sized(type, 64, 16) #define intra_pred_above_4x4(type) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ @@ -973,100 +764,29 @@ static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, intra_pred_highbd_sized(type, 64, 64) \ intra_pred_rectangular(type) #define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2, 2) \ intra_pred_sized(type, 4, 4) \ - intra_pred_highbd_sized(type, 2, 2) \ intra_pred_above_4x4(type) -#else // CONFIG_TX64X64 -#define intra_pred_rectangular(type) \ - intra_pred_sized(type, 4, 8) \ - intra_pred_sized(type, 8, 4) \ - intra_pred_sized(type, 8, 16) \ - intra_pred_sized(type, 16, 8) \ - intra_pred_sized(type, 16, 32) \ - intra_pred_sized(type, 32, 16) \ - intra_pred_highbd_sized(type, 4, 8) \ - intra_pred_highbd_sized(type, 8, 4) \ - intra_pred_highbd_sized(type, 8, 16) \ - intra_pred_highbd_sized(type, 16, 8) \ - intra_pred_highbd_sized(type, 16, 32) \ - intra_pred_highbd_sized(type, 32, 16) -#define intra_pred_above_4x4(type) \ +#define intra_pred_square(type) \ + intra_pred_sized(type, 4, 4) \ intra_pred_sized(type, 8, 8) \ intra_pred_sized(type, 16, 16) \ intra_pred_sized(type, 32, 32) \ + intra_pred_sized(type, 64, 64) \ intra_pred_highbd_sized(type, 4, 4) \ intra_pred_highbd_sized(type, 8, 8) \ intra_pred_highbd_sized(type, 16, 16) \ intra_pred_highbd_sized(type, 32, 32) \ - intra_pred_rectangular(type) -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2, 2) \ - intra_pred_sized(type, 4, 4) \ - intra_pred_highbd_sized(type, 2, 2) \ - intra_pred_above_4x4(type) -#endif // CONFIG_TX64X64 - -#else - -#if CONFIG_TX64X64 -#define intra_pred_rectangular(type) \ - intra_pred_sized(type, 4, 8) \ - intra_pred_sized(type, 8, 4) \ - intra_pred_sized(type, 8, 16) \ - intra_pred_sized(type, 16, 8) \ - intra_pred_sized(type, 16, 32) \ - intra_pred_sized(type, 32, 16) \ - intra_pred_sized(type, 32, 64) \ - intra_pred_sized(type, 64, 32) -#define intra_pred_above_4x4(type) \ - intra_pred_sized(type, 8, 8) \ - intra_pred_sized(type, 16, 16) \ - intra_pred_sized(type, 32, 32) \ - intra_pred_sized(type, 64, 64) \ - intra_pred_rectangular(type) -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2, 2) \ - intra_pred_sized(type, 4, 4) \ - intra_pred_above_4x4(type) -#else // CONFIG_TX64X64 -#define intra_pred_rectangular(type) \ - intra_pred_sized(type, 4, 8) \ - intra_pred_sized(type, 8, 4) \ - intra_pred_sized(type, 8, 16) \ - intra_pred_sized(type, 16, 8) \ - intra_pred_sized(type, 16, 32) \ - intra_pred_sized(type, 32, 16) -#define intra_pred_above_4x4(type) \ - intra_pred_sized(type, 8, 8) \ - intra_pred_sized(type, 16, 16) \ - intra_pred_sized(type, 32, 32) \ - intra_pred_rectangular(type) -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 2, 2) \ - intra_pred_sized(type, 4, 4) \ - intra_pred_above_4x4(type) -#endif // CONFIG_TX64X64 - -#endif // CONFIG_HIGHBITDEPTH + intra_pred_highbd_sized(type, 64, 64) -intra_pred_allsizes(d207e) -intra_pred_allsizes(d63e) -intra_pred_above_4x4(d45e) -intra_pred_above_4x4(d117) -intra_pred_above_4x4(d135) -intra_pred_above_4x4(d153) intra_pred_allsizes(v) intra_pred_allsizes(h) intra_pred_allsizes(smooth) -#if CONFIG_SMOOTH_HV intra_pred_allsizes(smooth_v) intra_pred_allsizes(smooth_h) -#endif // CONFIG_SMOOTH_HV intra_pred_allsizes(paeth) intra_pred_allsizes(dc_128) intra_pred_allsizes(dc_left) intra_pred_allsizes(dc_top) -intra_pred_allsizes(dc) +intra_pred_square(dc) /* clang-format on */ #undef intra_pred_allsizes diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h index 96da49b03..e047d98bc 100644 --- a/third_party/aom/aom_dsp/intrapred_common.h +++ b/third_party/aom/aom_dsp/intrapred_common.h @@ -12,19 +12,16 @@ #ifndef _AOM_DSP_INTRAPRED_COMMON_H #define _AOM_DSP_INTRAPRED_COMMON_H -#include "./aom_config.h" +#include "config/aom_config.h" // Weights are quadratic from '1' to '1 / block_size', scaled by // 2^sm_weight_log2_scale. static const int sm_weight_log2_scale = 8; -#if CONFIG_TX64X64 // max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST]) #define MAX_BLOCK_DIM 64 -#else -#define MAX_BLOCK_DIM 32 -#endif // CONFIG_TX64X64 +/* clang-format off */ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { // Unused, because we always offset by bs, which is at least 2. 0, 0, @@ -39,13 +36,12 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { // bs = 32 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74, 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8, -#if CONFIG_TX64X64 // bs = 64 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156, 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4, -#endif // CONFIG_TX64X64 }; +/* clang-format on */ #endif // _AOM_DSP_INTRAPRED_COMMON_H diff --git a/third_party/aom/aom_dsp/inv_txfm.c b/third_party/aom/aom_dsp/inv_txfm.c deleted file mode 100644 index 6b7c1c2ab..000000000 --- a/third_party/aom/aom_dsp/inv_txfm.c +++ /dev/null @@ -1,1482 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/inv_txfm.h" -#if CONFIG_DAALA_DCT4 || CONFIG_DAALA_DCT8 || CONFIG_DAALA_DCT16 || \ - CONFIG_DAALA_DCT32 || CONFIG_DAALA_DCT64 -#include "av1/common/daala_tx.h" -#endif - -void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = WRAPLOW(a1); - op[1] = WRAPLOW(b1); - op[2] = WRAPLOW(c1); - op[3] = WRAPLOW(d1); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); - dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); - dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); - dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); - - ip++; - dest++; - } -} - -void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = WRAPLOW(a1); - op[1] = op[2] = op[3] = WRAPLOW(e1); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); - ip++; - dest++; - } -} - -void aom_idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; - tran_high_t temp1, temp2; - // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1)); - step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1)); - step[3] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - output[0] = WRAPLOW(step[0] + step[3]); - output[1] = WRAPLOW(step[1] + step[2]); - output[2] = WRAPLOW(step[1] - step[2]); - output[3] = WRAPLOW(step[0] - step[3]); -} - -void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - - // Rows - for (i = 0; i < 4; ++i) { - aom_idct4_c(input, outptr); - input += 4; - outptr += 4; - } - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - aom_idct4_c(temp_in, temp_out); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 4)); - } - } -} - -void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - int i; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 4); - - if (a1 == 0) return; - - for (i = 0; i < 4; i++) { - dest[0] = clip_pixel_add(dest[0], a1); - dest[1] = clip_pixel_add(dest[1], a1); - dest[2] = clip_pixel_add(dest[2], a1); - dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; - } -} - -void aom_idct8_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; - tran_high_t temp1, temp2; - // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - temp1 = (step1[0] + step1[2]) * cospi_16_64; - temp2 = (step1[0] - step1[2]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - // stage 3 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - // stage 4 - output[0] = WRAPLOW(step1[0] + step1[7]); - output[1] = WRAPLOW(step1[1] + step1[6]); - output[2] = WRAPLOW(step1[2] + step1[5]); - output[3] = WRAPLOW(step1[3] + step1[4]); - output[4] = WRAPLOW(step1[3] - step1[4]); - output[5] = WRAPLOW(step1[2] - step1[5]); - output[6] = WRAPLOW(step1[1] - step1[6]); - output[7] = WRAPLOW(step1[0] - step1[7]); -} - -void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - for (i = 0; i < 8; ++i) { - aom_idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - aom_idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 5); - if (a1 == 0) return; - for (j = 0; j < 8; ++j) { - for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = WRAPLOW(x0 - x2 + x3); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); - output[2] = WRAPLOW(dct_const_round_shift(s2)); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); -} - -void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - - tran_high_t x0 = input[7]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[5]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[3]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[1]; - tran_high_t x7 = input[6]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = 0; - return; - } - - // stage 1 - s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); - s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); - s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); - s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); - s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); - s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); - s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); - s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); - - x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); - - // stage 2 - s0 = (int)x0; - s1 = (int)x1; - s2 = (int)x2; - s3 = (int)x3; - s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); - s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); - s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); - s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - - // stage 3 - s2 = (int)(cospi_16_64 * (x2 + x3)); - s3 = (int)(cospi_16_64 * (x2 - x3)); - s6 = (int)(cospi_16_64 * (x6 + x7)); - s7 = (int)(cospi_16_64 * (x6 - x7)); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x4); - output[2] = WRAPLOW(x6); - output[3] = WRAPLOW(-x2); - output[4] = WRAPLOW(x3); - output[5] = WRAPLOW(-x7); - output[6] = WRAPLOW(x5); - output[7] = WRAPLOW(-x1); -} - -void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - // only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) { - aom_idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - aom_idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } -} - -void aom_idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 / 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - step2[11] = step1[11]; - step2[12] = step1[12]; - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = step1[14]; - step2[15] = step1[15]; - - // stage 7 - output[0] = WRAPLOW(step2[0] + step2[15]); - output[1] = WRAPLOW(step2[1] + step2[14]); - output[2] = WRAPLOW(step2[2] + step2[13]); - output[3] = WRAPLOW(step2[3] + step2[12]); - output[4] = WRAPLOW(step2[4] + step2[11]); - output[5] = WRAPLOW(step2[5] + step2[10]); - output[6] = WRAPLOW(step2[6] + step2[9]); - output[7] = WRAPLOW(step2[7] + step2[8]); - output[8] = WRAPLOW(step2[7] - step2[8]); - output[9] = WRAPLOW(step2[6] - step2[9]); - output[10] = WRAPLOW(step2[5] - step2[10]); - output[11] = WRAPLOW(step2[4] - step2[11]); - output[12] = WRAPLOW(step2[3] - step2[12]); - output[13] = WRAPLOW(step2[2] - step2[13]); - output[14] = WRAPLOW(step2[1] - step2[14]); - output[15] = WRAPLOW(step2[0] - step2[15]); -} - -void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows - for (i = 0; i < 16; ++i) { - aom_idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - aom_idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] = output[13] = output[14] = output[15] = 0; - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4); - x1 = WRAPLOW(s1 + s5); - x2 = WRAPLOW(s2 + s6); - x3 = WRAPLOW(s3 + s7); - x4 = WRAPLOW(s0 - s4); - x5 = WRAPLOW(s1 - s5); - x6 = WRAPLOW(s2 - s6); - x7 = WRAPLOW(s3 - s7); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - x8 = WRAPLOW(s8 + s10); - x9 = WRAPLOW(s9 + s11); - x10 = WRAPLOW(s8 - s10); - x11 = WRAPLOW(s9 - s11); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - x10 = WRAPLOW(dct_const_round_shift(s10)); - x11 = WRAPLOW(dct_const_round_shift(s11)); - x14 = WRAPLOW(dct_const_round_shift(s14)); - x15 = WRAPLOW(dct_const_round_shift(s15)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x8); - output[2] = WRAPLOW(x12); - output[3] = WRAPLOW(-x4); - output[4] = WRAPLOW(x6); - output[5] = WRAPLOW(x14); - output[6] = WRAPLOW(x10); - output[7] = WRAPLOW(x2); - output[8] = WRAPLOW(x3); - output[9] = WRAPLOW(x11); - output[10] = WRAPLOW(x15); - output[11] = WRAPLOW(x7); - output[12] = WRAPLOW(x5); - output[13] = WRAPLOW(-x13); - output[14] = WRAPLOW(x9); - output[15] = WRAPLOW(-x1); -} - -void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - int i, j; - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 8x8 area, we only need to calculate first 8 rows here. - for (i = 0; i < 8; ++i) { - aom_idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - aom_idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - for (i = 0; i < 4; ++i) { - aom_idct16_c(input, outptr); - input += 16; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - aom_idct16_c(temp_in, temp_out); - for (j = 0; j < 16; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 6); - if (a1 == 0) return; - for (j = 0; j < 16; ++j) { - for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void aom_idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; - tran_high_t temp1, temp2; - - // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1)); - step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); - - // stage 2 - step2[0] = step1[0]; - step2[1] = step1[1]; - step2[2] = step1[2]; - step2[3] = step1[3]; - step2[4] = step1[4]; - step2[5] = step1[5]; - step2[6] = step1[6]; - step2[7] = step1[7]; - - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1)); - step2[15] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - - step2[16] = WRAPLOW(step1[16] + step1[17]); - step2[17] = WRAPLOW(step1[16] - step1[17]); - step2[18] = WRAPLOW(-step1[18] + step1[19]); - step2[19] = WRAPLOW(step1[18] + step1[19]); - step2[20] = WRAPLOW(step1[20] + step1[21]); - step2[21] = WRAPLOW(step1[20] - step1[21]); - step2[22] = WRAPLOW(-step1[22] + step1[23]); - step2[23] = WRAPLOW(step1[22] + step1[23]); - step2[24] = WRAPLOW(step1[24] + step1[25]); - step2[25] = WRAPLOW(step1[24] - step1[25]); - step2[26] = WRAPLOW(-step1[26] + step1[27]); - step2[27] = WRAPLOW(step1[26] + step1[27]); - step2[28] = WRAPLOW(step1[28] + step1[29]); - step2[29] = WRAPLOW(step1[28] - step1[29]); - step2[30] = WRAPLOW(-step1[30] + step1[31]); - step2[31] = WRAPLOW(step1[30] + step1[31]); - - // stage 3 - step1[0] = step2[0]; - step1[1] = step2[1]; - step1[2] = step2[2]; - step1[3] = step2[3]; - - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1)); - step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - - step1[8] = WRAPLOW(step2[8] + step2[9]); - step1[9] = WRAPLOW(step2[8] - step2[9]); - step1[10] = WRAPLOW(-step2[10] + step2[11]); - step1[11] = WRAPLOW(step2[10] + step2[11]); - step1[12] = WRAPLOW(step2[12] + step2[13]); - step1[13] = WRAPLOW(step2[12] - step2[13]); - step1[14] = WRAPLOW(-step2[14] + step2[15]); - step1[15] = WRAPLOW(step2[14] + step2[15]); - - step1[16] = step2[16]; - step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1)); - step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - step1[19] = step2[19]; - step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[27] = step2[27]; - step1[28] = step2[28]; - - // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1)); - step2[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1)); - step2[3] = WRAPLOW(dct_const_round_shift(temp2)); - step2[4] = WRAPLOW(step1[4] + step1[5]); - step2[5] = WRAPLOW(step1[4] - step1[5]); - step2[6] = WRAPLOW(-step1[6] + step1[7]); - step2[7] = WRAPLOW(step1[6] + step1[7]); - - step2[8] = step1[8]; - step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1)); - step2[14] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - step2[11] = step1[11]; - step2[12] = step1[12]; - - step2[16] = WRAPLOW(step1[16] + step1[19]); - step2[17] = WRAPLOW(step1[17] + step1[18]); - step2[18] = WRAPLOW(step1[17] - step1[18]); - step2[19] = WRAPLOW(step1[16] - step1[19]); - step2[20] = WRAPLOW(-step1[20] + step1[23]); - step2[21] = WRAPLOW(-step1[21] + step1[22]); - step2[22] = WRAPLOW(step1[21] + step1[22]); - step2[23] = WRAPLOW(step1[20] + step1[23]); - - step2[24] = WRAPLOW(step1[24] + step1[27]); - step2[25] = WRAPLOW(step1[25] + step1[26]); - step2[26] = WRAPLOW(step1[25] - step1[26]); - step2[27] = WRAPLOW(step1[24] - step1[27]); - step2[28] = WRAPLOW(-step1[28] + step1[31]); - step2[29] = WRAPLOW(-step1[29] + step1[30]); - step2[30] = WRAPLOW(step1[29] + step1[30]); - step2[31] = WRAPLOW(step1[28] + step1[31]); - - // stage 5 - step1[0] = WRAPLOW(step2[0] + step2[3]); - step1[1] = WRAPLOW(step2[1] + step2[2]); - step1[2] = WRAPLOW(step2[1] - step2[2]); - step1[3] = WRAPLOW(step2[0] - step2[3]); - step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1)); - step1[6] = WRAPLOW(dct_const_round_shift(temp2)); - step1[7] = step2[7]; - - step1[8] = WRAPLOW(step2[8] + step2[11]); - step1[9] = WRAPLOW(step2[9] + step2[10]); - step1[10] = WRAPLOW(step2[9] - step2[10]); - step1[11] = WRAPLOW(step2[8] - step2[11]); - step1[12] = WRAPLOW(-step2[12] + step2[15]); - step1[13] = WRAPLOW(-step2[13] + step2[14]); - step1[14] = WRAPLOW(step2[13] + step2[14]); - step1[15] = WRAPLOW(step2[12] + step2[15]); - - step1[16] = step2[16]; - step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1)); - step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1)); - step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - step1[22] = step2[22]; - step1[23] = step2[23]; - step1[24] = step2[24]; - step1[25] = step2[25]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // stage 6 - step2[0] = WRAPLOW(step1[0] + step1[7]); - step2[1] = WRAPLOW(step1[1] + step1[6]); - step2[2] = WRAPLOW(step1[2] + step1[5]); - step2[3] = WRAPLOW(step1[3] + step1[4]); - step2[4] = WRAPLOW(step1[3] - step1[4]); - step2[5] = WRAPLOW(step1[2] - step1[5]); - step2[6] = WRAPLOW(step1[1] - step1[6]); - step2[7] = WRAPLOW(step1[0] - step1[7]); - step2[8] = step1[8]; - step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1)); - step2[13] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1)); - step2[12] = WRAPLOW(dct_const_round_shift(temp2)); - step2[14] = step1[14]; - step2[15] = step1[15]; - - step2[16] = WRAPLOW(step1[16] + step1[23]); - step2[17] = WRAPLOW(step1[17] + step1[22]); - step2[18] = WRAPLOW(step1[18] + step1[21]); - step2[19] = WRAPLOW(step1[19] + step1[20]); - step2[20] = WRAPLOW(step1[19] - step1[20]); - step2[21] = WRAPLOW(step1[18] - step1[21]); - step2[22] = WRAPLOW(step1[17] - step1[22]); - step2[23] = WRAPLOW(step1[16] - step1[23]); - - step2[24] = WRAPLOW(-step1[24] + step1[31]); - step2[25] = WRAPLOW(-step1[25] + step1[30]); - step2[26] = WRAPLOW(-step1[26] + step1[29]); - step2[27] = WRAPLOW(-step1[27] + step1[28]); - step2[28] = WRAPLOW(step1[27] + step1[28]); - step2[29] = WRAPLOW(step1[26] + step1[29]); - step2[30] = WRAPLOW(step1[25] + step1[30]); - step2[31] = WRAPLOW(step1[24] + step1[31]); - - // stage 7 - step1[0] = WRAPLOW(step2[0] + step2[15]); - step1[1] = WRAPLOW(step2[1] + step2[14]); - step1[2] = WRAPLOW(step2[2] + step2[13]); - step1[3] = WRAPLOW(step2[3] + step2[12]); - step1[4] = WRAPLOW(step2[4] + step2[11]); - step1[5] = WRAPLOW(step2[5] + step2[10]); - step1[6] = WRAPLOW(step2[6] + step2[9]); - step1[7] = WRAPLOW(step2[7] + step2[8]); - step1[8] = WRAPLOW(step2[7] - step2[8]); - step1[9] = WRAPLOW(step2[6] - step2[9]); - step1[10] = WRAPLOW(step2[5] - step2[10]); - step1[11] = WRAPLOW(step2[4] - step2[11]); - step1[12] = WRAPLOW(step2[3] - step2[12]); - step1[13] = WRAPLOW(step2[2] - step2[13]); - step1[14] = WRAPLOW(step2[1] - step2[14]); - step1[15] = WRAPLOW(step2[0] - step2[15]); - - step1[16] = step2[16]; - step1[17] = step2[17]; - step1[18] = step2[18]; - step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1)); - step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1)); - step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1)); - step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1)); - step1[24] = WRAPLOW(dct_const_round_shift(temp2)); - step1[28] = step2[28]; - step1[29] = step2[29]; - step1[30] = step2[30]; - step1[31] = step2[31]; - - // final stage - output[0] = WRAPLOW(step1[0] + step1[31]); - output[1] = WRAPLOW(step1[1] + step1[30]); - output[2] = WRAPLOW(step1[2] + step1[29]); - output[3] = WRAPLOW(step1[3] + step1[28]); - output[4] = WRAPLOW(step1[4] + step1[27]); - output[5] = WRAPLOW(step1[5] + step1[26]); - output[6] = WRAPLOW(step1[6] + step1[25]); - output[7] = WRAPLOW(step1[7] + step1[24]); - output[8] = WRAPLOW(step1[8] + step1[23]); - output[9] = WRAPLOW(step1[9] + step1[22]); - output[10] = WRAPLOW(step1[10] + step1[21]); - output[11] = WRAPLOW(step1[11] + step1[20]); - output[12] = WRAPLOW(step1[12] + step1[19]); - output[13] = WRAPLOW(step1[13] + step1[18]); - output[14] = WRAPLOW(step1[14] + step1[17]); - output[15] = WRAPLOW(step1[15] + step1[16]); - output[16] = WRAPLOW(step1[15] - step1[16]); - output[17] = WRAPLOW(step1[14] - step1[17]); - output[18] = WRAPLOW(step1[13] - step1[18]); - output[19] = WRAPLOW(step1[12] - step1[19]); - output[20] = WRAPLOW(step1[11] - step1[20]); - output[21] = WRAPLOW(step1[10] - step1[21]); - output[22] = WRAPLOW(step1[9] - step1[22]); - output[23] = WRAPLOW(step1[8] - step1[23]); - output[24] = WRAPLOW(step1[7] - step1[24]); - output[25] = WRAPLOW(step1[6] - step1[25]); - output[26] = WRAPLOW(step1[5] - step1[26]); - output[27] = WRAPLOW(step1[4] - step1[27]); - output[28] = WRAPLOW(step1[3] - step1[28]); - output[29] = WRAPLOW(step1[2] - step1[29]); - output[30] = WRAPLOW(step1[1] - step1[30]); - output[31] = WRAPLOW(step1[0] - step1[31]); -} - -#if CONFIG_MRC_TX -void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride, uint8_t *mask) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - aom_idct32_c(input, outptr); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - aom_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - // Only add the coefficient if the mask value is 1 - int mask_val = mask[j * 32 + i]; - dest[j * stride + i] = - mask_val ? clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)) - : dest[j * stride + i]; - } - } -} - -void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, - uint8_t *mask) { - tran_low_t out[32 * 32] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 16x16 has non-zero coeff - for (i = 0; i < 16; ++i) { - aom_idct32_c(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - aom_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - // Only add the coefficient if the mask value is 1 - int mask_val = mask[j * 32 + i]; - dest[j * stride + i] = - mask_val ? clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)) - : dest[j * stride + i]; - } - } -} - -void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, - uint8_t *mask) { - tran_low_t out[32 * 32] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 8x8 has non-zero coeff - for (i = 0; i < 8; ++i) { - aom_idct32_c(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - aom_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - // Only add the coefficient if the mask value is 1 - int mask_val = mask[j * 32 + i]; - dest[j * stride + i] = - mask_val ? clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)) - : dest[j * stride + i]; - } - } -} -#endif // CONFIG_MRC_TX - -void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32]; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - for (i = 0; i < 32; ++i) { - int16_t zero_coeff[16]; - for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1]; - for (j = 0; j < 8; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 4; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - for (j = 0; j < 2; ++j) - zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; - - if (zero_coeff[0] | zero_coeff[1]) - aom_idct32_c(input, outptr); - else - memset(outptr, 0, sizeof(tran_low_t) * 32); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - aom_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 16x16 has non-zero coeff - for (i = 0; i < 16; ++i) { - aom_idct32_c(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - aom_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, - int stride) { - tran_low_t out[32 * 32] = { 0 }; - tran_low_t *outptr = out; - int i, j; - tran_low_t temp_in[32], temp_out[32]; - - // Rows - // only upper-left 8x8 has non-zero coeff - for (i = 0; i < 8; ++i) { - aom_idct32_c(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; - aom_idct32_c(temp_in, temp_out); - for (j = 0; j < 32; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 6)); - } - } -} - -void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - tran_high_t a1; - - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); - a1 = ROUND_POWER_OF_TWO(out, 6); - if (a1 == 0) return; - - for (j = 0; j < 32; ++j) { - for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); - dest += stride; - } -} - -void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, - 0.5 shifts per pixel. */ - int i; - tran_low_t output[16]; - tran_high_t a1, b1, c1, d1, e1; - const tran_low_t *ip = input; - tran_low_t *op = output; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - for (i = 0; i < 4; i++) { - a1 = ip[0] >> UNIT_QUANT_SHIFT; - c1 = ip[1] >> UNIT_QUANT_SHIFT; - d1 = ip[2] >> UNIT_QUANT_SHIFT; - b1 = ip[3] >> UNIT_QUANT_SHIFT; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - op[0] = HIGHBD_WRAPLOW(a1, bd); - op[1] = HIGHBD_WRAPLOW(b1, bd); - op[2] = HIGHBD_WRAPLOW(c1, bd); - op[3] = HIGHBD_WRAPLOW(d1, bd); - ip += 4; - op += 4; - } - - ip = output; - for (i = 0; i < 4; i++) { - a1 = ip[4 * 0]; - c1 = ip[4 * 1]; - d1 = ip[4 * 2]; - b1 = ip[4 * 3]; - a1 += c1; - d1 -= b1; - e1 = (a1 - d1) >> 1; - b1 = e1 - b1; - c1 = e1 - c1; - a1 -= b1; - d1 += c1; - dest[stride * 0] = - highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); - dest[stride * 1] = - highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); - dest[stride * 2] = - highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); - dest[stride * 3] = - highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); - - ip++; - dest++; - } -} - -void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { - int i; - tran_high_t a1, e1; - tran_low_t tmp[4]; - const tran_low_t *ip = in; - tran_low_t *op = tmp; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - (void)bd; - - a1 = ip[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - op[0] = HIGHBD_WRAPLOW(a1, bd); - op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); - - ip = tmp; - for (i = 0; i < 4; i++) { - e1 = ip[0] >> 1; - a1 = ip[0] - e1; - dest[dest_stride * 0] = - highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = - highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = - highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = - highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); - ip++; - dest++; - } -} diff --git a/third_party/aom/aom_dsp/inv_txfm.h b/third_party/aom/aom_dsp/inv_txfm.h deleted file mode 100644 index 644a6599f..000000000 --- a/third_party/aom/aom_dsp/inv_txfm.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_INV_TXFM_H_ -#define AOM_DSP_INV_TXFM_H_ - -#include - -#include "./aom_config.h" -#include "aom_dsp/txfm_common.h" -#include "aom_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -static INLINE tran_high_t dct_const_round_shift(tran_high_t input) { - return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); -} - -static INLINE tran_high_t check_range(tran_high_t input, int bd) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - // For valid AV1 input streams, intermediate stage coefficients should always - // stay within the range of a signed 16 bit integer. Coefficients can go out - // of this range for invalid/corrupt AV1 streams. However, strictly checking - // this range for every intermediate coefficient can burdensome for a decoder, - // therefore the following assertion is only enabled when configured with - // --enable-coefficient-range-checking. - // For valid highbitdepth AV1 streams, intermediate stage coefficients will - // stay within the ranges: - // - 8 bit: signed 16 bit integer - // - 10 bit: signed 18 bit integer - // - 12 bit: signed 20 bit integer - const int32_t int_max = (1 << (7 + bd)) - 1; - const int32_t int_min = -int_max - 1; - assert(int_min <= input); - assert(input <= int_max); - (void)int_min; -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - (void)bd; - return input; -} - -#define WRAPLOW(x) ((int32_t)check_range(x, 8)) -#define HIGHBD_WRAPLOW(x, bd) ((int32_t)check_range((x), bd)) - -#if CONFIG_MRC_TX -// These each perform dct but add coefficients based on a mask -void aom_imrc32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, - int stride, uint8_t *mask); - -void aom_imrc32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride, - uint8_t *mask); - -void aom_imrc32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, - uint8_t *mask); -#endif // CONFIG_MRC_TX - -void aom_idct4_c(const tran_low_t *input, tran_low_t *output); -void aom_idct8_c(const tran_low_t *input, tran_low_t *output); -void aom_idct16_c(const tran_low_t *input, tran_low_t *output); -void aom_idct32_c(const tran_low_t *input, tran_low_t *output); -#if CONFIG_TX64X64 && CONFIG_DAALA_DCT64 -void aom_idct64_c(const tran_low_t *input, tran_low_t *output); -#endif -void aom_iadst4_c(const tran_low_t *input, tran_low_t *output); -void aom_iadst8_c(const tran_low_t *input, tran_low_t *output); -void aom_iadst16_c(const tran_low_t *input, tran_low_t *output); - -void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd); -void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd); -void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd); -void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd); - -void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd); -void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd); -void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd); - -static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans, - int bd) { - trans = HIGHBD_WRAPLOW(trans, bd); - return clip_pixel_highbd(dest + (int)trans, bd); -} - -static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { - trans = WRAPLOW(trans); - return clip_pixel(dest + (int)trans); -} -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_DSP_INV_TXFM_H_ diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c index 69f131378..a3f261824 100644 --- a/third_party/aom/aom_dsp/loopfilter.c +++ b/third_party/aom/aom_dsp/loopfilter.c @@ -11,8 +11,9 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" @@ -20,18 +21,6 @@ static INLINE int8_t signed_char_clamp(int t) { return (int8_t)clamp(t, -128, 127); } -#define PARALLEL_DEBLOCKING_11_TAP 0 -#define PARALLEL_DEBLOCKING_9_TAP 0 - -#if CONFIG_DEBLOCK_13TAP -#define PARALLEL_DEBLOCKING_13_TAP 1 -#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 1 -#else -#define PARALLEL_DEBLOCKING_13_TAP 0 -#define PARALLEL_DEBLOCKING_5_TAP_CHROMA 0 -#endif - -#if CONFIG_HIGHBITDEPTH static INLINE int16_t signed_char_clamp_high(int t, int bd) { switch (bd) { case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1); @@ -40,8 +29,7 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) { default: return (int16_t)clamp(t, -128, 128 - 1); } } -#endif -#if CONFIG_PARALLEL_DEBLOCKING + // should we apply any filter at all: 11111111 yes, 00000000 no static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { @@ -51,7 +39,7 @@ static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1, mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; return ~mask; } -#endif // CONFIG_PARALLEL_DEBLOCKING + static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) { @@ -66,7 +54,18 @@ static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3, return ~mask; } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit, + uint8_t p2, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, uint8_t q2) { + int8_t mask = 0; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, uint8_t q2) { @@ -77,7 +76,6 @@ static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1, mask |= (abs(q2 - q0) > thresh) * -1; return ~mask; } -#endif static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1, @@ -92,39 +90,6 @@ static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2, return ~mask; } -#if PARALLEL_DEBLOCKING_9_TAP -static INLINE int8_t flat_mask2(uint8_t thresh, uint8_t p4, uint8_t p0, - uint8_t q0, uint8_t q4) { - int8_t mask = 0; - mask |= (abs(p4 - p0) > thresh) * -1; - mask |= (abs(q4 - q0) > thresh) * -1; - return ~mask; -} -#endif - -#if PARALLEL_DEBLOCKING_11_TAP -static INLINE int8_t flat_mask3(uint8_t thresh, uint8_t p5, uint8_t p4, - uint8_t p0, uint8_t q0, uint8_t q4, - uint8_t q5) { - int8_t mask = 0; - mask |= (abs(p4 - p0) > thresh) * -1; - mask |= (abs(q4 - q0) > thresh) * -1; - mask |= (abs(p5 - p0) > thresh) * -1; - mask |= (abs(q5 - q0) > thresh) * -1; - return ~mask; -} -#endif - -static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3, - uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0, - uint8_t q1, uint8_t q2, uint8_t q3, - uint8_t q4) { - int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); - mask |= (abs(p4 - p0) > thresh) * -1; - mask |= (abs(q4 - q0) > thresh) * -1; - return ~mask; -} - // is there high edge variance internal edge: 11111111 yes, 00000000 no static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1) { @@ -170,25 +135,14 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; - const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); -#else // CONFIG_PARALLEL_DEBLOCKING const uint8_t p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p]; const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); -#endif // !CONFIG_PARALLEL_DEBLOCKING filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); ++s; } @@ -199,31 +153,20 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); + aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1); } void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); -#else // CONFIG_PARALLEL_DEBLOCKING const uint8_t p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1]; const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1); -#endif // !CONFIG_PARALLEL_DEBLOCKING filter4(mask, *thresh, s - 2, s - 1, s, s + 1); s += pitch; } @@ -234,10 +177,9 @@ void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0); - aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); + aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op2, uint8_t *op1, uint8_t *op0, uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) { @@ -254,7 +196,6 @@ static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat, filter4(mask, thresh, op1, op0, oq0, oq1); } } -#endif static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, uint8_t *op3, uint8_t *op2, uint8_t *op1, @@ -276,40 +217,38 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, } } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); ++s; } } -#endif + +void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0); + aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1); +} void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -331,39 +270,37 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); - aom_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); + aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1); } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { - const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2]; const int8_t mask = - filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); + filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2); const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2); filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2); s += pitch; } } -#endif + +void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0); + aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); +} void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; @@ -382,10 +319,9 @@ void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0); - aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1); + aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1); } -#if PARALLEL_DEBLOCKING_13_TAP static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint8_t *op6, uint8_t *op5, uint8_t *op4, uint8_t *op3, uint8_t *op2, @@ -433,186 +369,43 @@ static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat, filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); } } -#endif - -#if PARALLEL_DEBLOCKING_11_TAP -static INLINE void filter12(int8_t mask, uint8_t thresh, int8_t flat, - int8_t flat2, uint8_t *op5, uint8_t *op4, - uint8_t *op3, uint8_t *op2, uint8_t *op1, - uint8_t *op0, uint8_t *oq0, uint8_t *oq1, - uint8_t *oq2, uint8_t *oq3, uint8_t *oq4, - uint8_t *oq5) { - if (flat2 && flat && mask) { - const uint8_t p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, - p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, - q5 = *oq5; - - // 11-tap filter [1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1] - *op4 = (p5 * 5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + 6) / 12; - *op3 = (p5 * 4 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + 6) / 12; - *op2 = (p5 * 3 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + 6) / 12; - *op1 = (p5 * 2 + p4 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + q3 + 6) / 12; - *op0 = (p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + q4 + 6) / 12; - *oq0 = (p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + q5 + 6) / 12; - *oq1 = (p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 + q5 * 2 + 6) / 12; - *oq2 = (p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 * 3 + 6) / 12; - *oq3 = (p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 * 4 + 6) / 12; - *oq4 = (p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 5 + 6) / 12; - } else { - filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); - } -} -#endif - -#if PARALLEL_DEBLOCKING_9_TAP -static INLINE void filter10(int8_t mask, uint8_t thresh, int8_t flat, - int8_t flat2, uint8_t *op4, uint8_t *op3, - uint8_t *op2, uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, - uint8_t *oq3, uint8_t *oq4) { - if (flat2 && flat && mask) { - const uint8_t p4 = *op4, p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4; - - // 9-tap filter [1, 1, 1, 1, 2, 1, 1, 1, 1] - *op3 = (p4 * 4 + p3 * 2 + p2 + p1 + p0 + q0 + 5) / 10; - *op2 = (p4 * 3 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + 5) / 10; - *op1 = (p4 * 2 + p3 + p2 + p1 * 2 + p0 + q0 + q1 + q2 + 5) / 10; - *op0 = (p4 + p3 + p2 + p1 + p0 * 2 + q0 + q1 + q2 + q3 + 5) / 10; - *oq0 = (p3 + p2 + p1 + p0 + q0 * 2 + q1 + q2 + q3 + q4 + 5) / 10; - *oq1 = (p2 + p1 + p0 + q0 + q1 * 2 + q2 + q3 + q4 * 2 + 5) / 10; - *oq2 = (p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 * 3 + 5) / 10; - *oq3 = (p0 + q0 + q1 + q2 + q3 * 2 + q4 * 4 + 5) / 10; - } else { - filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); - } -} -#endif - -static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat, - int8_t flat2, uint8_t *op7, uint8_t *op6, - uint8_t *op5, uint8_t *op4, uint8_t *op3, - uint8_t *op2, uint8_t *op1, uint8_t *op0, - uint8_t *oq0, uint8_t *oq1, uint8_t *oq2, - uint8_t *oq3, uint8_t *oq4, uint8_t *oq5, - uint8_t *oq6, uint8_t *oq7) { - if (flat2 && flat && mask) { - const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, - p2 = *op2, p1 = *op1, p0 = *op0; - - const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4, - q5 = *oq5, q6 = *oq6, q7 = *oq7; - - // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO( - p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); - *op5 = ROUND_POWER_OF_TWO( - p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO( - p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO( - p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO( - p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, - 4); - *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, - 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + - q1 + q2 + q3 + q4 + q5 + q6, - 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + - q2 + q3 + q4 + q5 + q6 + q7, - 4); - *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + - q3 + q4 + q5 + q6 + q7 * 2, - 4); - *oq2 = ROUND_POWER_OF_TWO( - p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, - 4); - *oq3 = ROUND_POWER_OF_TWO( - p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO( - p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO( - p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO( - p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); - } else { - filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); - } -} static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int step = 4; -#else - int step = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < step * count; ++i) { - const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p], - p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p], - p1 = s[-2 * p], p0 = s[-p]; + const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], + p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p], - q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p], q7 = s[7 * p]; + q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - -#if PARALLEL_DEBLOCKING_13_TAP - (void)p7; - (void)q7; const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p); - -#elif PARALLEL_DEBLOCKING_11_TAP - const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); - - filter12(mask, *thresh, flat, flat2, s - 6 * p, s - 5 * p, s - 4 * p, - s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, - s + 3 * p, s + 4 * p, s + 5 * p); - -#elif PARALLEL_DEBLOCKING_9_TAP - const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4); - - filter10(mask, *thresh, flat, flat2, s - 5 * p, s - 4 * p, s - 3 * p, - s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, - s + 4 * p); -#else - const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); - - filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, - s + 7 * p); -#endif - ++s; } } -void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { +void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); } -void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); -#else - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); -#endif +void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1); + mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1); } static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, @@ -621,60 +414,34 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, int i; for (i = 0; i < count; ++i) { - const uint8_t p7 = s[-8], p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], - p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3], + p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4], - q5 = s[5], q6 = s[6], q7 = s[7]; + q5 = s[5], q6 = s[6]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - -#if PARALLEL_DEBLOCKING_13_TAP - (void)p7; - (void)q7; const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6); filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6); -#elif PARALLEL_DEBLOCKING_11_TAP - const int8_t flat2 = flat_mask3(1, p5, p4, p0, q0, q4, q5); - - filter12(mask, *thresh, flat, flat2, s - 6, s - 5, s - 4, s - 3, s - 2, - s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5); -#elif PARALLEL_DEBLOCKING_9_TAP - const int8_t flat2 = flat_mask2(1, p4, p0, q0, q4); - - filter10(mask, *thresh, flat, flat2, s - 5, s - 4, s - 3, s - 2, s - 1, s, - s + 1, s + 2, s + 3, s + 4); - -#else - const int8_t flat2 = flat_mask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7); - - filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, - s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, - s + 7); -#endif - s += p; } } -void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, +void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); -#else - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); -#endif } -void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); +void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, + const uint8_t *limit0, const uint8_t *thresh0, + const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { + mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4); + mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4); } -#if CONFIG_HIGHBITDEPTH -#if CONFIG_PARALLEL_DEBLOCKING // Should we apply any filter at all: 11111111 yes, 00000000 no ? static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, uint16_t p1, uint16_t p0, uint16_t q0, @@ -687,7 +454,6 @@ static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit, mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; return ~mask; } -#endif // CONFIG_PARALLEL_DEBLOCKING // Should we apply any filter at all: 11111111 yes, 00000000 no ? static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, @@ -707,7 +473,22 @@ static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit, return ~mask; } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA +static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit, + uint16_t p2, uint16_t p1, + uint16_t p0, uint16_t q0, + uint16_t q1, uint16_t q2, + int bd) { + int8_t mask = 0; + int16_t limit16 = (uint16_t)limit << (bd - 8); + int16_t blimit16 = (uint16_t)blimit << (bd - 8); + mask |= (abs(p2 - p1) > limit16) * -1; + mask |= (abs(p1 - p0) > limit16) * -1; + mask |= (abs(q1 - q0) > limit16) * -1; + mask |= (abs(q2 - q1) > limit16) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1; + return ~mask; +} + static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, uint16_t q1, @@ -720,7 +501,6 @@ static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2, mask |= (abs(q2 - q0) > thresh16) * -1; return ~mask; } -#endif static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0, uint16_t q0, @@ -737,17 +517,6 @@ static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2, return ~mask; } -static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3, - uint16_t p2, uint16_t p1, uint16_t p0, - uint16_t q0, uint16_t q1, uint16_t q2, - uint16_t q3, uint16_t q4, int bd) { - int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd); - int16_t thresh16 = (uint16_t)thresh << (bd - 8); - mask |= (abs(p4 - p0) > thresh16) * -1; - mask |= (abs(q4 - q0) > thresh16) * -1; - return ~mask; -} - // Is there high edge variance internal edge: // 11111111_11111111 yes, 00000000_00000000 no ? static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0, @@ -798,34 +567,17 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); -#else // CONFIG_PARALLEL_DEBLOCKING const uint16_t p1 = s[-2 * p]; const uint16_t p0 = s[-p]; const uint16_t q0 = s[0 * p]; const uint16_t q1 = s[1 * p]; const int8_t mask = highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); -#endif // !CONFIG_PARALLEL_DEBLOCKING highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); ++s; } @@ -836,33 +588,22 @@ void aom_highbd_lpf_horizontal_4_dual_c( const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); + aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd); } void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { -#if !CONFIG_PARALLEL_DEBLOCKING - const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; - const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); -#else // CONFIG_PARALLEL_DEBLOCKING const uint16_t p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1]; const int8_t mask = highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd); -#endif // !CONFIG_PARALLEL_DEBLOCKING highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd); s += pitch; } @@ -873,11 +614,10 @@ void aom_highbd_lpf_vertical_4_dual_c( const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op2, uint16_t *op1, uint16_t *op0, uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, @@ -895,7 +635,6 @@ static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat, highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd); } } -#endif static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat, uint16_t *op3, uint16_t *op2, uint16_t *op1, @@ -921,11 +660,7 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -943,74 +678,75 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, } } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < count; ++i) { - const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p]; const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, bd); ++s; } } -#endif + +void aom_highbd_lpf_horizontal_6_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd); +} void aom_highbd_lpf_horizontal_8_dual_c( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); + aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd); } -#if PARALLEL_DEBLOCKING_5_TAP_CHROMA void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { - const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; - const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint16_t q0 = s[0], q1 = s[1], q2 = s[2]; const int8_t mask = - highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); + highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd); const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd); highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2, bd); s += pitch; } } -#endif + +void aom_highbd_lpf_vertical_6_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd); + aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + bd); +} void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int count = 4; -#else - int count = 8; -#endif for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; @@ -1030,11 +766,10 @@ void aom_highbd_lpf_vertical_8_dual_c( const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd); - aom_highbd_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1, + aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1, bd); } -#if PARALLEL_DEBLOCKING_13_TAP static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, int8_t flat2, uint16_t *op6, uint16_t *op5, uint16_t *op4, uint16_t *op3, uint16_t *op2, @@ -1094,73 +829,6 @@ static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat, bd); } } -#endif - -static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat, - int8_t flat2, uint16_t *op7, uint16_t *op6, - uint16_t *op5, uint16_t *op4, uint16_t *op3, - uint16_t *op2, uint16_t *op1, uint16_t *op0, - uint16_t *oq0, uint16_t *oq1, uint16_t *oq2, - uint16_t *oq3, uint16_t *oq4, uint16_t *oq5, - uint16_t *oq6, uint16_t *oq7, int bd) { - if (flat2 && flat && mask) { - const uint16_t p7 = *op7; - const uint16_t p6 = *op6; - const uint16_t p5 = *op5; - const uint16_t p4 = *op4; - const uint16_t p3 = *op3; - const uint16_t p2 = *op2; - const uint16_t p1 = *op1; - const uint16_t p0 = *op0; - const uint16_t q0 = *oq0; - const uint16_t q1 = *oq1; - const uint16_t q2 = *oq2; - const uint16_t q3 = *oq3; - const uint16_t q4 = *oq4; - const uint16_t q5 = *oq5; - const uint16_t q6 = *oq6; - const uint16_t q7 = *oq7; - - // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] - *op6 = ROUND_POWER_OF_TWO( - p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4); - *op5 = ROUND_POWER_OF_TWO( - p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4); - *op4 = ROUND_POWER_OF_TWO( - p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4); - *op3 = ROUND_POWER_OF_TWO( - p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4); - *op2 = ROUND_POWER_OF_TWO( - p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4, - 4); - *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + - q0 + q1 + q2 + q3 + q4 + q5, - 4); - *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 + - q1 + q2 + q3 + q4 + q5 + q6, - 4); - *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 + - q2 + q3 + q4 + q5 + q6 + q7, - 4); - *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 + - q3 + q4 + q5 + q6 + q7 * 2, - 4); - *oq2 = ROUND_POWER_OF_TWO( - p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, - 4); - *oq3 = ROUND_POWER_OF_TWO( - p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); - *oq4 = ROUND_POWER_OF_TWO( - p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); - *oq5 = ROUND_POWER_OF_TWO( - p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); - *oq6 = ROUND_POWER_OF_TWO( - p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); - } else { - highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3, - bd); - } -} static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *blimit, @@ -1168,11 +836,7 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *thresh, int count, int bd) { int i; -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 int step = 4; -#else - int step = 8; -#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. @@ -1190,7 +854,6 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); -#if PARALLEL_DEBLOCKING_13_TAP const int8_t flat2 = highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p], s[5 * p], s[6 * p], bd); @@ -1198,36 +861,22 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd); -#else - const int8_t flat2 = - highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); - - highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, - s + 6 * p, s + 7 * p, bd); -#endif ++s; } } -void aom_highbd_lpf_horizontal_edge_8_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { +void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); } -void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); -#else - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); -#endif +void aom_highbd_lpf_horizontal_14_dual_c( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd); + highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd); } static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, @@ -1250,43 +899,27 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); -#if PARALLEL_DEBLOCKING_13_TAP const int8_t flat2 = highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd); highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, bd); -#else - const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, - q0, s[4], s[5], s[6], s[7], bd); - - highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, - s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, - s + 5, s + 6, s + 7, bd); -#endif s += p; } } -void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, +void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); -#else - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); -#endif } -void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); -#else - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); -#endif +void aom_highbd_lpf_vertical_14_dual_c( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd); + highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, + 4, bd); } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c index 4c6e201e1..96d04cff0 100644 --- a/third_party/aom/aom_dsp/mips/add_noise_msa.c +++ b/third_party/aom/aom_dsp/mips/add_noise_msa.c @@ -10,7 +10,8 @@ */ #include -#include "./macros_msa.h" + +#include "aom_dsp/mips/macros_msa.h" void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, char blackclamp[16], char whiteclamp[16], diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c deleted file mode 100644 index 847394a3d..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_horiz_msa.c +++ /dev/null @@ -1,704 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, res2, res3; - v16u8 mask0, mask1, mask2, mask3; - v8i16 filt, res0, res1; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, res0, res1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - SRARI_H2_SH(res0, res1, FILTER_BITS); - SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - XORI_B2_128_UB(res2, res3); - AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v8i16 filt, vec0, vec1, vec2, vec3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, vec0, vec1); - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, - filt0, filt1, filt2, filt3, vec2, vec3); - SRARI_H4_SH(vec0, vec1, vec2, vec3, FILTER_BITS); - SAT_SH4_SH(vec0, vec1, vec2, vec3, 7); - PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, res0, res1, res2, - res3); - ILVR_D2_UB(res1, res0, res3, res2, res0, res2); - XORI_B2_128_UB(res0, res2); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); - AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); - ST4x8_UB(res0, res2, dst, dst_stride); -} - -static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - int32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; - v8i16 filt, out0, out1, out2, out3; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - XORI_B4_128_SB(src0, src1, src2, src3); - src += (4 * src_stride); - HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, - mask3, filt0, filt1, filt2, filt3, out0, out1, - out2, out3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - int32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1; - v8i16 filt, out0, out1, out2, out3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height >> 1; loop_cnt--;) { - LD_SB2(src, src_stride, src0, src2); - LD_SB2(src + 8, src_stride, src1, src3); - src += (2 * src_stride); - - XORI_B4_128_SB(src0, src1, src2, src3); - VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); - VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); - VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, - vec14); - VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, - vec15); - DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, - vec9, vec10, vec11); - DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, - vec2, vec3); - DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, - vec9, vec10, vec11); - ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, - out2, out3); - LD_UB2(dst, dst_stride, dst0, dst1); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst); - dst += dst_stride; - PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst); - dst += dst_stride; - } -} - -static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst1, dst2, mask0, mask1, mask2, mask3; - v8i16 filt, out0, out1, out2, out3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height; loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - - XORI_B4_128_SB(src0, src1, src2, src3); - VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, vec12); - VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, vec13); - VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, - vec14); - VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, - vec15); - DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, - vec9, vec10, vec11); - DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, vec1, - vec2, vec3); - DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, - vec9, vec10, vec11); - ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - LD_UB2(dst, 16, dst1, dst2); - PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst); - PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16); - dst += dst_stride; - } -} - -static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt, cnt; - v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst1, dst2, mask0, mask1, mask2, mask3; - v8i16 filt, out0, out1, out2, out3; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= 3; - - /* rearranging filter */ - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - for (loop_cnt = height; loop_cnt--;) { - for (cnt = 0; cnt < 2; ++cnt) { - src0 = LD_SB(&src[cnt << 5]); - src2 = LD_SB(&src[16 + (cnt << 5)]); - src3 = LD_SB(&src[24 + (cnt << 5)]); - src1 = __msa_sldi_b(src2, src0, 8); - - XORI_B4_128_SB(src0, src1, src2, src3); - VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8, - vec12); - VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9, - vec13); - VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10, - vec14); - VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11, - vec15); - DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, - vec1, vec2, vec3); - DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8, - vec9, vec10, vec11); - DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0, - vec1, vec2, vec3); - DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3, vec8, - vec9, vec10, vec11); - ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0, out1, - out2, out3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - LD_UB2(&dst[cnt << 5], 16, dst1, dst2); - PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]); - PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]); - } - - src += src_stride; - dst += dst_stride; - } -} - -static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; - v8u16 vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); - SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v8u16 vec4, vec5, vec6, vec7, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); - VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, - vec6, vec7); - SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, - res3); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); -} - -static void common_hz_2t_and_aver_dst_8x8mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v8u16 vec0, vec1, vec2, vec3, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - if (16 == height) { - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, - vec2, vec3); - SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); - } -} - -static void common_hz_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, - filter, height); - } -} - -static void common_hz_2t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, - res2, res3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, - res6, res7); - SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); - SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST_UB(res1, res0, dst0, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res3, res2, dst1, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res5, res4, dst2, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res7, res6, dst3, dst); - dst += dst_stride; - - for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, - res2, res3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, - res6, res7); - SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); - SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST_UB(res1, res0, dst0, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res3, res2, dst1, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res5, res4, dst2, dst); - dst += dst_stride; - PCKEV_AVG_ST_UB(res7, res6, dst3, dst); - dst += dst_stride; - } -} - -static void common_hz_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = (height >> 1); loop_cnt--;) { - src0 = LD_SB(src); - src2 = LD_SB(src + 16); - src3 = LD_SB(src + 24); - src1 = __msa_sldi_b(src2, src0, 8); - src += src_stride; - src4 = LD_SB(src); - src6 = LD_SB(src + 16); - src7 = LD_SB(src + 24); - src5 = __msa_sldi_b(src6, src4, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1, - res2, res3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5, - res6, res7); - SRARI_H4_UH(res0, res1, res2, res3, FILTER_BITS); - SRARI_H4_UH(res4, res5, res6, res7, FILTER_BITS); - LD_UB2(dst, 16, dst0, dst1); - PCKEV_AVG_ST_UB(res1, res0, dst0, dst); - PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16)); - dst += dst_stride; - LD_UB2(dst, 16, dst2, dst3); - PCKEV_AVG_ST_UB(res5, res4, dst2, dst); - PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16)); - dst += dst_stride; - } -} - -static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - for (loop_cnt = height; loop_cnt--;) { - LD_SB4(src, 16, src0, src2, src4, src6); - src7 = LD_SB(src + 56); - SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8); - src += src_stride; - - VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1); - VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3); - VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5); - VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1, - out2, out3); - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5, - out6, out7); - SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS); - SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS); - LD_UB4(dst, 16, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST_UB(out1, out0, dst0, dst); - PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16); - PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32); - PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48); - dst += dst_stride; - } -} - -void aom_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_hor[8]; - - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0) { - switch (w) { - case 4: - common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 8: - common_hz_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 16: - common_hz_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 32: - common_hz_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - case 64: - common_hz_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], h); - break; - default: - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - common_hz_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 8: - common_hz_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 16: - common_hz_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 32: - common_hz_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - case 64: - common_hz_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, h); - break; - default: - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c deleted file mode 100644 index bed600d5b..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_msa.c +++ /dev/null @@ -1,605 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_hv_8ht_8vt_and_aver_dst_4w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; - v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); - vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); - vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - SRARI_H2_SH(res0, res1, FILTER_BITS); - SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); - XORI_B2_128_UB(tmp0, tmp1); - AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out5 = hz_out9; - vec0 = vec2; - vec1 = vec3; - vec2 = vec4; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_8w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); - ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); - ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); - tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); - out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); - tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - hz_out6 = hz_out10; - out0 = out2; - out1 = out3; - out2 = out8; - out4 = out6; - out5 = out7; - out6 = out9; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_16w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_32w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_and_aver_dst_64w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 8; multiple8_cnt--;) { - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 dst0, dst1, dst2, dst3, res0, res1; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); - - filt = LD_UH(filter_vert); - filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - src8 = LD_SB(src); - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); - SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, - hz_out3, hz_out5, 8); - hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, - tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, - res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_and_aver_dst_4w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert); - } else if (8 == height) { - common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert); - } -} - -static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - src += (5 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec1, filt_vt); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec2, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec3, filt_vt); - - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); -} - -static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_SB(src); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hv_2ht_2vt_and_aver_dst_8w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert); - } else { - common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( - src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height); - } -} - -static void common_hv_2ht_2vt_and_aver_dst_16w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst); - dst += dst_stride; - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst); - dst += dst_stride; - } -} - -static void common_hv_2ht_2vt_and_aver_dst_32w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 16; - dst += 16; - } -} - -static void common_hv_2ht_2vt_and_aver_dst_64w_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - src += 16; - dst += 16; - } -} - -void aom_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_hor[8], filt_ver[8]; - - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], h); - break; - case 8: - common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], h); - break; - case 16: - common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); - break; - case 32: - common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); - break; - case 64: - common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, - &filt_hor[3], &filt_ver[3], h); - break; - default: - aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { - aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - } else { - switch (w) { - case 4: - common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 8: - common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 16: - common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 32: - common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - case 64: - common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, - filt_ver, h); - break; - default: - aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c deleted file mode 100644 index dae771104..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_avg_vert_msa.c +++ /dev/null @@ -1,677 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, out; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776; - v16i8 src10998, filt0, filt1, filt2, filt3; - v8i16 filt, out10, out32; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110, - src4332, src6554); - XORI_B3_128_SB(src2110, src4332, src6554); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); - XORI_B2_128_SB(src8776, src10998); - out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0, - filt1, filt2, filt3); - out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0, - filt1, filt2, filt3); - SRARI_H2_SH(out10, out32, FILTER_BITS); - SAT_SH2_SH(out10, out32, 7); - out = PCKEV_XORI128_UB(out10, out32); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); - out = __msa_aver_u_b(out, dst0); - - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - - src2110 = src6554; - src4332 = src8776; - src6554 = src10998; - src6 = src10; - } -} - -static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; - v8i16 filt, out0, out1, out2, out3; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - XORI_B4_128_SB(src7, src8, src9, src10); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1, - filt2, filt3); - out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1, - filt2, filt3); - out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1, - filt2, filt3); - out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); - SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, - dst_stride); - dst += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src6 = src10; - } -} - -static void common_vt_8t_and_aver_dst_16w_mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height, int32_t width) { - const uint8_t *src_tmp; - uint8_t *dst_tmp; - uint32_t loop_cnt, cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; - v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l; - v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l; - v16i8 filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; - v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt; - - src -= (3 * src_stride); - - filt = LD_SH(filter); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); - - for (cnt = (width >> 4); cnt--;) { - src_tmp = src; - dst_tmp = dst; - - LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src_tmp += (7 * src_stride); - - ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r, - src54_r, src21_r); - ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r); - ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l, - src54_l, src21_l); - ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src_tmp, src_stride, src7, src8, src9, src10); - src_tmp += (4 * src_stride); - - LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3); - XORI_B4_128_SB(src7, src8, src9, src10); - ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, - src87_r, src98_r, src109_r); - ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l, - src87_l, src98_l, src109_l); - out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, - filt1, filt2, filt3); - out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, - filt1, filt2, filt3); - out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, - filt1, filt2, filt3); - out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0, - filt1, filt2, filt3); - out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0, - filt1, filt2, filt3); - out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0, - filt1, filt2, filt3); - out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0, - filt1, filt2, filt3); - out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0, - filt1, filt2, filt3); - SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS); - SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS); - SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); - SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7); - PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, - out3_r, tmp0, tmp1, tmp2, tmp3); - XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3); - AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1, - dst2, dst3); - ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride); - dst_tmp += (4 * dst_stride); - - src10_r = src54_r; - src32_r = src76_r; - src54_r = src98_r; - src21_r = src65_r; - src43_r = src87_r; - src65_r = src109_r; - src10_l = src54_l; - src32_l = src76_l; - src54_l = src98_l; - src21_l = src65_l; - src43_l = src87_l; - src65_l = src109_l; - src6 = src10; - } - - src += 16; - dst += 16; - } -} - -static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 16); -} - -static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 32); -} - -static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride, - filter, height, 64); -} - -static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16i8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; - v16i8 src10_r, src32_r, src21_r, src43_r; - v8i16 filt; - v8u16 tmp0, tmp1; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - src4 = LD_SB(src); - src += src_stride; - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); - DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - - out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); - out = __msa_aver_u_b(out, dst0); - - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; - v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; - v16u8 src2110, src4332, src6554, src8776, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - src8 = LD_SB(src); - - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, - dst3); - ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, - src32_r, src43_r); - ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, - src76_r, src87_r); - ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r, - src76_r, src2110, src4332, src6554, src8776); - DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0, - tmp0, tmp1, tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); - AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); -} - -static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride, filter); - } else if (8 == height) { - common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride, filter); - } -} - -static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter) { - v16u8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); - ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); -} - -static void common_vt_2t_and_aver_dst_8x8mult_msa( - const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - /* rearranging filter_y */ - filt = LD_SH(filter); - filt0 = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); - src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); - - ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, - vec3); - ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6, - vec7); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, - dst_stride); - dst += (4 * dst_stride); - - DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, - tmp2, tmp3); - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, - dst_stride); - dst += (4 * dst_stride); - - src0 = src8; - } -} - -static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int8_t *filter, - int32_t height) { - if (4 == height) { - common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter); - } else { - common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride, - filter, height); - } -} - -static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3, filt; - - /* rearranging filter_y */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - src0 = LD_UB(src); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - dst += dst_stride; - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst); - dst += dst_stride; - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst); - dst += dst_stride; - - src0 = src4; - } -} - -static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; - v8u16 tmp0, tmp1, tmp2, tmp3, filt; - - /* rearranging filter_y */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_UB2(src, 16, src0, src5); - src += src_stride; - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_UB4(src, src_stride, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - - LD_UB4(src + 16, src_stride, src6, src7, src8, src9); - LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7); - src += (4 * src_stride); - - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); - - ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6); - ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride); - - ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2); - ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride); - - ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6); - ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride); - dst += (4 * dst_stride); - - src0 = src4; - src5 = src9; - } -} - -static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter, int32_t height) { - uint32_t loop_cnt; - v16u8 src0, src1, src2, src3, src4, src5; - v16u8 src6, src7, src8, src9, src10, src11, filt0; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8u16 filt; - - /* rearranging filter_y */ - filt = LD_UH(filter); - filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_UB4(src, 16, src0, src3, src6, src9); - src += src_stride; - - for (loop_cnt = (height >> 1); loop_cnt--;) { - LD_UB2(src, src_stride, src1, src2); - LD_UB2(dst, dst_stride, dst0, dst1); - LD_UB2(src + 16, src_stride, src4, src5); - LD_UB2(dst + 16, dst_stride, dst2, dst3); - LD_UB2(src + 32, src_stride, src7, src8); - LD_UB2(dst + 32, dst_stride, dst4, dst5); - LD_UB2(src + 48, src_stride, src10, src11); - LD_UB2(dst + 48, dst_stride, dst6, dst7); - src += (2 * src_stride); - - ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2); - ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride); - - ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6); - ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride); - - ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2); - ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3); - DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32); - - DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3); - SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride); - - ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6); - ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7); - DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5); - SRARI_H2_UH(tmp4, tmp5, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48)); - - DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7); - SRARI_H2_UH(tmp6, tmp7, FILTER_BITS); - PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride); - dst += (2 * dst_stride); - - src0 = src2; - src3 = src5; - src6 = src8; - src9 = src11; - } -} - -void aom_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - int8_t cnt, filt_ver[8]; - - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 8: - common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 16: - common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 32: - common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - case 64: - common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_ver[3], h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } else { - switch (w) { - case 4: - common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - case 8: - common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - case 16: - common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - - break; - case 32: - common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - case 64: - common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_ver, h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c index fc3a823c5..363fad308 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c @@ -10,7 +10,9 @@ */ #include -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/aom_convolve_msa.h" static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c deleted file mode 100644 index a4d594931..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_msa.c +++ /dev/null @@ -1,630 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/aom_convolve_msa.h" - -const uint8_t mc_filt_mask_arr[16 * 3] = { - /* 8 width cases */ - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - /* 4 width cases */ - 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, - /* 4 width cases */ - 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28 -}; - -static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; - v16u8 mask0, mask1, mask2, mask3, out; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - - mask0 = LD_UB(&mc_filt_mask_arr[16]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); - out2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - XORI_B4_128_SB(src7, src8, src9, src10); - src += (4 * src_stride); - - hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); - out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8); - out4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - SRARI_H2_SH(tmp0, tmp1, FILTER_BITS); - SAT_SH2_SH(tmp0, tmp1, 7); - out = PCKEV_XORI128_UB(tmp0, tmp1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out5 = hz_out9; - out0 = out2; - out1 = out3; - out2 = out4; - } -} - -static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; - v16u8 mask0, mask1, mask2, mask3, vec0, vec1; - v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; - - mask0 = LD_UB(&mc_filt_mask_arr[0]); - src -= (3 + 3 * src_stride); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); - - mask1 = mask0 + 2; - mask2 = mask0 + 4; - mask3 = mask0 + 6; - - LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6); - src += (7 * src_stride); - - XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6); - hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - - filt = LD_SH(filter_vert); - SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - - ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1); - ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4); - ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src7, src8, src9, src10); - src += (4 * src_stride); - - XORI_B4_128_SB(src7, src8, src9, src10); - - hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6); - tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7); - tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0, - filt_hz1, filt_hz2, filt_hz3); - out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); - tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - - hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, - filt_hz0, filt_hz1, filt_hz2, filt_hz3); - out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9); - tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1, - filt_vt2, filt_vt3); - SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - vec0 = PCKEV_XORI128_UB(tmp0, tmp1); - vec1 = PCKEV_XORI128_UB(tmp2, tmp3); - ST8x4_UB(vec0, vec1, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out6 = hz_out10; - out0 = out2; - out1 = out3; - out2 = out8; - out4 = out6; - out5 = out7; - out6 = out9; - } -} - -static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 8; multiple8_cnt--;) { - common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 8; - dst += 8; - } -} - -static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); - - filt = LD_UH(filter_vert); - filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8); - hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); - - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); - SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16i8 res0, res1, res2, res3; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; - v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; - v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt; - - mask = LD_SB(&mc_filt_mask_arr[16]); - - /* rearranging filter */ - filt = LD_UH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0); - - filt = LD_UH(filter_vert); - filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0); - - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - src8 = LD_SB(src); - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS); - hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS); - hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS); - hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS); - SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1, - hz_out3, hz_out5, 8); - hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); - DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4, - vec5, vec6, vec7); - SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); - PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert); - } else if (8 == height) { - common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert); - } -} - -static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert) { - v16i8 src0, src1, src2, src3, src4, mask, out0, out1; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3; - v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB5(src, src_stride, src0, src1, src2, src3, src4); - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp0 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp1 = __msa_dotp_u_h(vec1, filt_vt); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp2 = __msa_dotp_u_h(vec2, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp3 = __msa_dotp_u_h(vec3, filt_vt); - - SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); -} - -static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - int8_t *filter_horiz, - int8_t *filter_vert, int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, mask, out0, out1; - v16u8 filt_hz, filt_vt, vec0; - v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - src0 = LD_SB(src); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 3); loop_cnt--;) { - LD_SB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp1 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp2 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp3 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - LD_SB4(src, src_stride, src1, src2, src3, src4); - src += (4 * src_stride); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp4 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H2_UH(tmp3, tmp4, FILTER_BITS); - PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp5 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp6 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); - tmp7 = __msa_dotp_u_h(vec0, filt_vt); - - hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1); - tmp8 = __msa_dotp_u_h(vec0, filt_vt); - - SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, FILTER_BITS); - PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - if (4 == height) { - common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert); - } else { - common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride, - filter_horiz, filter_vert, height); - } -} - -static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; - v16u8 filt_hz, filt_vt, vec0, vec1; - v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3; - v8i16 filt; - - mask = LD_SB(&mc_filt_mask_arr[0]); - - /* rearranging filter */ - filt = LD_SH(filter_horiz); - filt_hz = (v16u8)__msa_splati_h(filt, 0); - - filt = LD_SH(filter_vert); - filt_vt = (v16u8)__msa_splati_h(filt, 0); - - LD_SB2(src, 8, src0, src1); - src += src_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - - for (loop_cnt = (height >> 2); loop_cnt--;) { - LD_SB4(src, src_stride, src0, src2, src4, src6); - LD_SB4(src + 8, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - - hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - - hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS); - hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - - hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS); - hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS); - ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1); - DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2); - SRARI_H2_UH(tmp1, tmp2, FILTER_BITS); - PCKEV_ST_SB(tmp1, tmp2, dst); - dst += dst_stride; - } -} - -static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 2; multiple8_cnt--;) { - common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 16; - dst += 16; - } -} - -static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - int8_t *filter_horiz, int8_t *filter_vert, - int32_t height) { - int32_t multiple8_cnt; - for (multiple8_cnt = 4; multiple8_cnt--;) { - common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz, - filter_vert, height); - src += 16; - dst += 16; - } -} - -void aom_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, - int32_t y_step_q4, int32_t w, int32_t h) { - int8_t cnt, filt_hor[8], filt_ver[8]; - - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - for (cnt = 0; cnt < 8; ++cnt) { - filt_hor[cnt] = filter_x[cnt]; - filt_ver[cnt] = filter_y[cnt]; - } - - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { - switch (w) { - case 4: - common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 8: - common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 16: - common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 32: - common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - case 64: - common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, &filt_hor[3], - &filt_ver[3], (int32_t)h); - break; - default: - aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - break; - } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { - aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - } else { - switch (w) { - case 4: - common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 8: - common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 16: - common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 32: - common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - case 64: - common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride, dst, - (int32_t)dst_stride, filt_hor, filt_ver, - (int32_t)h); - break; - default: - aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c index f7bdfc2bd..aa962b41f 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c +++ b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c @@ -10,7 +10,9 @@ */ #include -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/aom_convolve_msa.h" static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c deleted file mode 100644 index 75f8c7ea8..000000000 --- a/third_party/aom/aom_dsp/mips/aom_convolve_avg_msa.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/macros_msa.h" - -static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int32_t height) { - int32_t cnt; - uint32_t out0, out1, out2, out3; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - - if (0 == (height % 4)) { - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - - out0 = __msa_copy_u_w((v4i32)dst0, 0); - out1 = __msa_copy_u_w((v4i32)dst1, 0); - out2 = __msa_copy_u_w((v4i32)dst2, 0); - out3 = __msa_copy_u_w((v4i32)dst3, 0); - SW4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } - } else if (0 == (height % 2)) { - for (cnt = (height / 2); cnt--;) { - LD_UB2(src, src_stride, src0, src1); - src += (2 * src_stride); - - LD_UB2(dst, dst_stride, dst0, dst1); - - AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1); - - out0 = __msa_copy_u_w((v4i32)dst0, 0); - out1 = __msa_copy_u_w((v4i32)dst1, 0); - SW(out0, dst); - dst += dst_stride; - SW(out1, dst); - dst += dst_stride; - } - } -} - -static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, - int32_t dst_stride, int32_t height) { - int32_t cnt; - uint64_t out0, out1, out2, out3; - v16u8 src0, src1, src2, src3; - v16u8 dst0, dst1, dst2, dst3; - - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - - out0 = __msa_copy_u_d((v2i64)dst0, 0); - out1 = __msa_copy_u_d((v2i64)dst1, 0); - out2 = __msa_copy_u_d((v2i64)dst2, 0); - out3 = __msa_copy_u_d((v2i64)dst3, 0); - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void avg_width16_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - - for (cnt = (height / 8); cnt--;) { - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, - dst6, dst7); - ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride); - dst += (8 * dst_stride); - } -} - -static void avg_width32_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - uint8_t *dst_dup = dst; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 src8, src9, src10, src11, src12, src13, src14, src15; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; - - for (cnt = (height / 8); cnt--;) { - LD_UB4(src, src_stride, src0, src2, src4, src6); - LD_UB4(src + 16, src_stride, src1, src3, src5, src7); - src += (4 * src_stride); - LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6); - LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7); - dst_dup += (4 * dst_stride); - LD_UB4(src, src_stride, src8, src10, src12, src14); - LD_UB4(src + 16, src_stride, src9, src11, src13, src15); - src += (4 * src_stride); - LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14); - LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15); - dst_dup += (4 * dst_stride); - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, - dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, - dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, - dst13, dst14, dst15); - - ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride); - ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride); - dst += (4 * dst_stride); - ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride); - ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride); - dst += (4 * dst_stride); - } -} - -static void avg_width64_msa(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, int32_t height) { - int32_t cnt; - uint8_t *dst_dup = dst; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v16u8 src8, src9, src10, src11, src12, src13, src14, src15; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; - - for (cnt = (height / 4); cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(src, 16, src4, src5, src6, src7); - src += src_stride; - LD_UB4(src, 16, src8, src9, src10, src11); - src += src_stride; - LD_UB4(src, 16, src12, src13, src14, src15); - src += src_stride; - - LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3); - dst_dup += dst_stride; - LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7); - dst_dup += dst_stride; - LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11); - dst_dup += dst_stride; - LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15); - dst_dup += dst_stride; - - AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1, - dst2, dst3); - AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5, - dst6, dst7); - AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11, dst8, dst9, - dst10, dst11); - AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15, dst12, - dst13, dst14, dst15); - - ST_UB4(dst0, dst1, dst2, dst3, dst, 16); - dst += dst_stride; - ST_UB4(dst4, dst5, dst6, dst7, dst, 16); - dst += dst_stride; - ST_UB4(dst8, dst9, dst10, dst11, dst, 16); - dst += dst_stride; - ST_UB4(dst12, dst13, dst14, dst15, dst, 16); - dst += dst_stride; - } -} - -void aom_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - - switch (w) { - case 4: { - avg_width4_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 8: { - avg_width8_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 16: { - avg_width16_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 32: { - avg_width32_msa(src, src_stride, dst, dst_stride, h); - break; - } - case 64: { - avg_width64_msa(src, src_stride, dst, dst_stride, h); - break; - } - default: { - int32_t lp, cnt; - for (cnt = h; cnt--;) { - for (lp = 0; lp < w; ++lp) { - dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1); - } - src += src_stride; - dst += dst_stride; - } - break; - } - } -} diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h index 1a0ae4d8d..a0627c074 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h +++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h @@ -31,23 +31,6 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; tmp_dpadd_0; \ }) -#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ - filt_h1, filt_h2, filt_h3) \ - ({ \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, vec0_m, vec1_m, vec2_m, \ - vec3_m); \ - hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, filt_h0, \ - filt_h1, filt_h2, filt_h3); \ - \ - hz_out_m = __msa_srari_h(hz_out_m, FILTER_BITS); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ - \ - hz_out_m; \ - }) - #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, mask3, filt0, filt1, filt2, filt3, \ out0, out1) \ @@ -93,32 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; res7_m, out0, out1, out2, out3); \ } -#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \ - { \ - v16u8 tmp_m; \ - \ - tmp_m = PCKEV_XORI128_UB(in1, in0); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ - } - -#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \ - { \ - v16u8 tmp_m; \ - \ - tmp_m = (v16u8)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ - tmp_m = __msa_aver_u_b(tmp_m, (v16u8)dst); \ - ST_UB(tmp_m, (pdst)); \ - } - -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ - stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ - } #endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h index 31159fdcd..d51bfa899 100644 --- a/third_party/aom/aom_dsp/mips/common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/common_dspr2.h @@ -13,7 +13,9 @@ #define AOM_COMMON_MIPS_DSPR2_H_ #include -#include "./aom_config.h" + +#include "config/aom_config.h" + #include "aom/aom_integer.h" #ifdef __cplusplus diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c deleted file mode 100644 index d557115b9..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_avg_dspr2.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2; - uint32_t p1, p2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t Temp1, Temp2; - const int16_t *filter = &filter_y[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" - - "extp %[Temp1], $ac0, 31 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1), - [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), - [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - uint32_t pos = 38; - - assert(y_step_q4 == 16); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_bi_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, - w, h); - break; - case 64: - prefetch_store(dst + 32); - convolve_bi_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, - h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c deleted file mode 100644 index efbdcf60f..000000000 --- a/third_party/aom/aom_dsp/mips/convolve2_avg_horiz_dspr2.c +++ /dev/null @@ -1,802 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3; - uint32_t tn1, tn2; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p3], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ - - /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - - "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=&r"(Temp4) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3, tp4; - uint32_t p1, p2, p3, p4, n1; - uint32_t st0, st1; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - "lbu %[Temp2], 0(%[dst]) \n\t" - "lbu %[tp4], 2(%[dst]) \n\t" - - /* even 2. pixel */ - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" - "sb %[Temp2], 0(%[dst]) \n\t" - "sb %[tp4], 2(%[dst]) \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "balign %[tp3], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "lbu %[Temp2], 4(%[dst]) \n\t" - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[Temp2], 4(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tp1], 6(%[dst]) \n\t" - - /* odd 2. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - "lbu %[tp2], 1(%[dst]) \n\t" - "lbu %[tp3], 3(%[dst]) \n\t" - "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" - - /* odd 3. pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" - "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tp4], 5(%[dst]) \n\t" - - /* odd 4. pixel */ - "sb %[tp2], 1(%[dst]) \n\t" - "sb %[tp1], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbu %[tp1], 7(%[dst]) \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" - - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" - - "lbux %[p1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" - - /* store bytes */ - "sb %[tp3], 3(%[dst]) \n\t" - "sb %[tp4], 5(%[dst]) \n\t" - "sb %[tp1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), - [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - const int16_t *filter = &filter_x0[3]; - uint32_t filter45; - - filter45 = ((const int32_t *)filter)[0]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm), - [dst] "r"(dst), [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - uint32_t pos = 38; - - assert(x_step_q4 == 16); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 8: - convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 16: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 1); - break; - case 32: - convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 2); - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - default: - aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c index 066308315..08bf1ab30 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c index dc51ab1cb..2a8f75938 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c index 3367be01a..ac87936da 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c deleted file mode 100644 index 3574da19f..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_avg_dspr2.c +++ /dev/null @@ -1,646 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t w, - int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - - for (x = 0; x < w; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_y, int32_t h) { - int32_t x, y; - const uint8_t *src_ptr; - uint8_t *dst_ptr; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - uint32_t load1, load2, load3, load4; - uint32_t p1, p2; - uint32_t n1, n2; - uint32_t scratch1, scratch2; - uint32_t store1, store2; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2; - - vector1b = ((const int32_t *)filter_y)[0]; - vector2b = ((const int32_t *)filter_y)[1]; - vector3b = ((const int32_t *)filter_y)[2]; - vector4b = ((const int32_t *)filter_y)[3]; - - src -= 3 * src_stride; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - for (x = 0; x < 64; x += 4) { - src_ptr = src + x; - dst_ptr = dst + x; - - __asm__ __volatile__( - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "mtlo %[vector4a], $ac0 \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac0 \n\t" - "mthi $zero, $ac1 \n\t" - "mthi $zero, $ac2 \n\t" - "mthi $zero, $ac3 \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load1], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load2], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load3], 0(%[src_ptr]) \n\t" - "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" - "ulw %[load4], 0(%[src_ptr]) \n\t" - - "preceu.ph.qbr %[scratch1], %[load1] \n\t" - "preceu.ph.qbr %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "preceu.ph.qbr %[scratch2], %[load3] \n\t" - "preceu.ph.qbr %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - - "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac0, 31 \n\t" - "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - "preceu.ph.qbl %[scratch1], %[load1] \n\t" - "preceu.ph.qbl %[p1], %[load2] \n\t" - "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ - "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ - "lbu %[scratch1], 0(%[dst_ptr]) \n\t" - "preceu.ph.qbl %[scratch2], %[load3] \n\t" - "preceu.ph.qbl %[p2], %[load4] \n\t" - "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ - "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ - "lbu %[scratch2], 1(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ - "extp %[Temp1], $ac2, 31 \n\t" - - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ - "extp %[Temp2], $ac3, 31 \n\t" - "lbu %[scratch1], 2(%[dst_ptr]) \n\t" - - "sb %[store1], 0(%[dst_ptr]) \n\t" - "sb %[store2], 1(%[dst_ptr]) \n\t" - "lbu %[scratch2], 3(%[dst_ptr]) \n\t" - - "lbux %[store1], %[Temp1](%[cm]) \n\t" - "lbux %[store2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ - "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ - - "sb %[store1], 2(%[dst_ptr]) \n\t" - "sb %[store2], 3(%[dst_ptr]) \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2), - [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1), - [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [store1] "=&r"(store1), - [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [src_stride] "r"(src_stride), - [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); - } - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -void aom_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - if (((const int32_t *)filter_y)[0] == 0) { - aom_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - prefetch_store(dst); - - switch (w) { - case 4: - case 8: - case 16: - case 32: - convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, - h); - break; - case 64: - prefetch_store(dst + 32); - convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, - h); - break; - default: - aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - break; - } - } -} - -void aom_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - /* Fixed size intermediate buffer places limits on parameters. */ - DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); - int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; - - assert(w <= 64); - assert(h <= 64); - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - - if (intermediate_height < h) intermediate_height = h; - - aom_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); - - aom_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); -} - -void aom_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - int x, y; - uint32_t tp1, tp2, tn1; - uint32_t tp3, tp4, tn2; - - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - /* 1 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - - : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 8: - /* 2 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 16: - /* 4 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 8(%[src]) \n\t" - "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - "ulw %[tp3], 12(%[src]) \n\t" - "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 32: - /* 8 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 8(%[src]) \n\t" - "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - "ulw %[tp3], 12(%[src]) \n\t" - "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 16(%[src]) \n\t" - "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ - "ulw %[tp3], 20(%[src]) \n\t" - "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 24(%[src]) \n\t" - "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ - "ulw %[tp3], 28(%[src]) \n\t" - "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 28(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - /* 16 word storage */ - for (y = h; y--;) { - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_load(src + src_stride + 64); - prefetch_store(dst + dst_stride); - prefetch_store(dst + dst_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 0(%[dst]) \n\t" - "ulw %[tp3], 4(%[src]) \n\t" - "ulw %[tp4], 4(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 8(%[src]) \n\t" - "ulw %[tp2], 8(%[dst]) \n\t" - "sw %[tn1], 0(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 4(%[dst]) \n\t" /* store */ - "ulw %[tp3], 12(%[src]) \n\t" - "ulw %[tp4], 12(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 16(%[src]) \n\t" - "ulw %[tp2], 16(%[dst]) \n\t" - "sw %[tn1], 8(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 12(%[dst]) \n\t" /* store */ - "ulw %[tp3], 20(%[src]) \n\t" - "ulw %[tp4], 20(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 24(%[src]) \n\t" - "ulw %[tp2], 24(%[dst]) \n\t" - "sw %[tn1], 16(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 20(%[dst]) \n\t" /* store */ - "ulw %[tp3], 28(%[src]) \n\t" - "ulw %[tp4], 28(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 32(%[src]) \n\t" - "ulw %[tp2], 32(%[dst]) \n\t" - "sw %[tn1], 24(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 28(%[dst]) \n\t" /* store */ - "ulw %[tp3], 36(%[src]) \n\t" - "ulw %[tp4], 36(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 40(%[src]) \n\t" - "ulw %[tp2], 40(%[dst]) \n\t" - "sw %[tn1], 32(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 36(%[dst]) \n\t" /* store */ - "ulw %[tp3], 44(%[src]) \n\t" - "ulw %[tp4], 44(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 48(%[src]) \n\t" - "ulw %[tp2], 48(%[dst]) \n\t" - "sw %[tn1], 40(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 44(%[dst]) \n\t" /* store */ - "ulw %[tp3], 52(%[src]) \n\t" - "ulw %[tp4], 52(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "ulw %[tp1], 56(%[src]) \n\t" - "ulw %[tp2], 56(%[dst]) \n\t" - "sw %[tn1], 48(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 52(%[dst]) \n\t" /* store */ - "ulw %[tp3], 60(%[src]) \n\t" - "ulw %[tp4], 60(%[dst]) \n\t" - "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ - "sw %[tn1], 56(%[dst]) \n\t" /* store */ - "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ - "sw %[tn2], 60(%[dst]) \n\t" /* store */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), - [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2) - : [src] "r"(src), [dst] "r"(dst)); - - src += src_stride; - dst += dst_stride; - } - break; - default: - for (y = h; y > 0; --y) { - for (x = 0; x < w; ++x) { - dst[x] = (dst[x] + src[x] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } - break; - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c deleted file mode 100644 index f6534b420..000000000 --- a/third_party/aom/aom_dsp/mips/convolve8_avg_horiz_dspr2.c +++ /dev/null @@ -1,998 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_ports/mem.h" - -#if HAVE_DSPR2 -static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4; - uint32_t n1, n2, n3, n4; - uint32_t tn1, tn2; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ - "preceu.ph.qbr %[n1], %[tp2] \n\t" - "preceu.ph.qbl %[n2], %[tp2] \n\t" - "preceu.ph.qbr %[n3], %[tn2] \n\t" - "preceu.ph.qbl %[n4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[n1], %[tn1] \n\t" - "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ - "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ - "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ - "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ - - /* clamp */ - "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ - "lbux %[n2], %[Temp4](%[cm]) \n\t" /* odd 2 */ - "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ - - "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ - "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ - - "addqh_r.w %[p2], %[p2], %[n2] \n\t" /* average odd 2 */ - "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), - [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride, - uint8_t *dst, int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector4a = 64; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4, n1; - uint32_t tn1, tn2, tn3; - uint32_t st0, st1; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - prefetch_store(dst + dst_stride); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - "lbu %[Temp2], 0(%[dst]) \n\t" - "lbu %[tn3], 2(%[dst]) \n\t" - - /* even 2. pixel */ - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "preceu.ph.qbl %[n1], %[tn2] \n\t" - "ulw %[tn1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[tn1] \n\t" - "lbux %[st1], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - "addqh_r.w %[tn3], %[tn3], %[st1] \n\t" - "sb %[Temp2], 0(%[dst]) \n\t" - "sb %[tn3], 2(%[dst]) \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "balign %[tn3], %[tn1], 3 \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - - "lbux %[st0], %[Temp1](%[cm]) \n\t" - "lbu %[Temp2], 4(%[dst]) \n\t" - "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" - - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sb %[Temp2], 4(%[dst]) \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tn2] \n\t" - "preceu.ph.qbl %[p4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tp1], 6(%[dst]) \n\t" - - /* odd 2. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn1] \n\t" - "preceu.ph.qbl %[n1], %[tn1] \n\t" - "lbux %[st0], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - "lbu %[tp2], 1(%[dst]) \n\t" - "lbu %[tn2], 3(%[dst]) \n\t" - "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" - - /* odd 3. pixel */ - "lbux %[st1], %[Temp2](%[cm]) \n\t" - "preceu.ph.qbr %[p2], %[tn3] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" - "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - "lbu %[tn3], 5(%[dst]) \n\t" - - /* odd 4. pixel */ - "sb %[tp2], 1(%[dst]) \n\t" - "sb %[tp1], 6(%[dst]) \n\t" - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - "lbu %[tn1], 7(%[dst]) \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "addqh_r.w %[tn2], %[tn2], %[p4] \n\t" - - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "addqh_r.w %[tn3], %[tn3], %[p2] \n\t" - - "lbux %[n1], %[Temp1](%[cm]) \n\t" - "addqh_r.w %[tn1], %[tn1], %[n1] \n\t" - - /* store bytes */ - "sb %[tn2], 3(%[dst]) \n\t" - "sb %[tn3], 5(%[dst]) \n\t" - "sb %[tn1], 7(%[dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0), - [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1), - [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - /* Next row... */ - src += src_stride; - dst += dst_stride; - } -} - -static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h, - int32_t count) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_store(dst_ptr + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "ulw %[qload2], 16(%[src]) \n\t" - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, - int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, - const int16_t *filter_x0, int32_t h) { - int32_t y, c; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2, qload3; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - src = src_ptr; - dst = dst_ptr; - - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - prefetch_store(dst_ptr + dst_stride); - prefetch_store(dst_ptr + dst_stride + 32); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) \n\t" - "ulw %[qload2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ - "mthi $zero, $ac1 \n\t" - "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "ulw %[qload3], 8(%[src]) \n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ - "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "ulw %[qload1], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ - - "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ - "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ - "ulw %[qload2], 16(%[src]) \n\t" - "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ - "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ - "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ - "ulw %[qload3], 20(%[src]) \n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ - "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ - "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ - "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ - - "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ - "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) \n\t" - "ulw %[qload2], 5(%[src]) \n\t" - - "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[qload1] \n\t" - "preceu.ph.qbl %[p2], %[qload1] \n\t" - "preceu.ph.qbr %[p3], %[qload2] \n\t" - "preceu.ph.qbl %[p4], %[qload2] \n\t" - "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ - "ulw %[qload3], 9(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ - "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ - "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ - - "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ - "preceu.ph.qbr %[p1], %[qload3] \n\t" - "preceu.ph.qbl %[p5], %[qload3] \n\t" - "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ - "ulw %[qload1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ - "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ - "preceu.ph.qbr %[p2], %[qload1] \n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ - "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ - "preceu.ph.qbl %[p3], %[qload1] \n\t" - "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ - "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ - "ulw %[qload2], 17(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ - - "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ - "mthi $zero, $ac2 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ - "preceu.ph.qbr %[p4], %[qload2] \n\t" - "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ - "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ - - "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ - "mthi $zero, $ac3 \n\t" - "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ - "preceu.ph.qbl %[p1], %[qload2] \n\t" - "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ - "ulw %[qload3], 21(%[src]) \n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ - "mthi $zero, $ac1 \n\t" - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ - "preceu.ph.qbr %[p5], %[qload3] \n\t" - "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ - "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ - "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ - - "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ - - "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ - - "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ - "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ - - "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ - "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ - - "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ - "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ - - "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ - "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ - "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1), - [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), - [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3), - [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst), - [src] "r"(src)); - - src += 16; - dst += 16; - } - - /* Next row... */ - src_ptr += src_stride; - dst_ptr += dst_stride; - } -} - -void aom_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - assert(x_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - - if (((const int32_t *)filter_x)[0] == 0) { - aom_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); - } else { - uint32_t pos = 38; - - src -= 3; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - prefetch_store(dst); - - switch (w) { - case 4: - convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 8: - convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - case 16: - convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 1); - break; - case 32: - convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x, - h, 2); - break; - case 64: - prefetch_load(src + 64); - prefetch_store(dst + 32); - - convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x, - h); - break; - default: - aom_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); - break; - } - } -} -#endif diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c index dd4bc821a..af54b4264 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c @@ -12,1389 +12,14 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" #if HAVE_DSPR2 -static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3, Temp4; - uint32_t vector4a = 64; - uint32_t tp1, tp2; - uint32_t p1, p2, p3, p4; - uint32_t tn1, tn2; - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - dst_ptr = dst; - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - __asm__ __volatile__( - "ulw %[tp1], 0(%[src]) \n\t" - "ulw %[tp2], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp2] \n\t" - "preceu.ph.qbl %[p4], %[tp2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "ulw %[tn2], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn2] \n\t" - "balign %[tn1], %[tn2], 3 \n\t" - "balign %[tn2], %[tp2], 3 \n\t" - "balign %[tp2], %[tp1], 3 \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* odd 1. pixel */ - "lbux %[tp1], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tn2] \n\t" - "preceu.ph.qbl %[p4], %[tn2] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp2], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tn1] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp4], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[tn1], %[Temp2](%[cm]) \n\t" - "lbux %[p2], %[Temp4](%[cm]) \n\t" - - /* store bytes */ - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[tn1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[tp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - "sb %[p2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), - [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), - [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), - [dst_stride] "r"(dst_stride)); - - /* Next row... */ - src += src_stride; - dst += 1; - } -} - -static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, - int32_t src_stride, uint8_t *dst, - int32_t dst_stride, - const int16_t *filter_x0, - int32_t h) { - int32_t y; - uint8_t *cm = aom_ff_cropTbl; - uint8_t *dst_ptr; - uint32_t vector4a = 64; - int32_t vector1b, vector2b, vector3b, vector4b; - int32_t Temp1, Temp2, Temp3; - uint32_t tp1, tp2, tp3; - uint32_t p1, p2, p3, p4, n1; - uint8_t *odd_dst; - uint32_t dst_pitch_2 = (dst_stride << 1); - - vector1b = ((const int32_t *)filter_x0)[0]; - vector2b = ((const int32_t *)filter_x0)[1]; - vector3b = ((const int32_t *)filter_x0)[2]; - vector4b = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src + src_stride); - prefetch_load(src + src_stride + 32); - - dst_ptr = dst; - odd_dst = (dst_ptr + dst_stride); - - __asm__ __volatile__( - "ulw %[tp2], 0(%[src]) \n\t" - "ulw %[tp1], 4(%[src]) \n\t" - - /* even 1. pixel */ - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[p2], %[tp2] \n\t" - "preceu.ph.qbr %[p3], %[tp1] \n\t" - "preceu.ph.qbl %[p4], %[tp1] \n\t" - "ulw %[tp3], 8(%[src]) \n\t" - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp1], $ac3, 31 \n\t" - - /* even 2. pixel */ - "preceu.ph.qbr %[p1], %[tp3] \n\t" - "preceu.ph.qbl %[n1], %[tp3] \n\t" - "ulw %[tp2], 12(%[src]) \n\t" - "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - /* even 3. pixel */ - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p2], %[tp2] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" - "lbux %[tp3], %[Temp3](%[cm]) \n\t" - "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" - "extp %[p3], $ac1, 31 \n\t" - - /* even 4. pixel */ - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sb %[Temp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "sb %[tp3], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - - "ulw %[tp1], 1(%[src]) \n\t" - "ulw %[tp3], 5(%[src]) \n\t" - - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp3], $ac2, 31 \n\t" - - "lbux %[tp2], %[p3](%[cm]) \n\t" - - /* odd 1. pixel */ - "mtlo %[vector4a], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "preceu.ph.qbr %[p1], %[tp1] \n\t" - "preceu.ph.qbl %[p2], %[tp1] \n\t" - "preceu.ph.qbr %[p3], %[tp3] \n\t" - "preceu.ph.qbl %[p4], %[tp3] \n\t" - "sb %[tp2], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "ulw %[tp2], 9(%[src]) \n\t" - - "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 2. pixel */ - "lbux %[tp1], %[Temp3](%[cm]) \n\t" - "mtlo %[vector4a], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "mtlo %[vector4a], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "preceu.ph.qbr %[p1], %[tp2] \n\t" - "preceu.ph.qbl %[n1], %[tp2] \n\t" - "ulw %[Temp1], 13(%[src]) \n\t" - "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" - "sb %[tp1], 0(%[dst_ptr]) \n\t" - "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" - "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" - "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" - "extp %[Temp3], $ac1, 31 \n\t" - - /* odd 3. pixel */ - "lbux %[tp3], %[Temp2](%[cm]) \n\t" - "preceu.ph.qbr %[p2], %[Temp1] \n\t" - "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" - "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" - "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" - "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" - "extp %[Temp2], $ac3, 31 \n\t" - - /* odd 4. pixel */ - "sb %[tp3], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" - "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" - "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" - "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" - "extp %[Temp1], $ac2, 31 \n\t" - - /* clamp */ - "lbux %[p4], %[Temp3](%[cm]) \n\t" - "lbux %[p2], %[Temp2](%[cm]) \n\t" - "lbux %[n1], %[Temp1](%[cm]) \n\t" - - /* store bytes */ - "sb %[p4], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[p2], 0(%[odd_dst]) \n\t" - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" - - "sb %[n1], 0(%[odd_dst]) \n\t" - - : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), - [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), - [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), - [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), - [dst_pitch_2] "r"(dst_pitch_2)); - - /* Next row... */ - src += src_stride; - dst += 1; - } -} - -static void convolve_horiz_16_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < count; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "ulw %[qload2], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "ulw %[qload1], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 16(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 17(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), - [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... */ - src_ptr += src_stride; - - dst_ptr += 1; - } -} - -static void convolve_horiz_64_transposed_dspr2( - const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, - int32_t dst_stride, const int16_t *filter_x0, int32_t h) { - int32_t c, y; - const uint8_t *src; - uint8_t *dst; - uint8_t *cm = aom_ff_cropTbl; - uint32_t vector_64 = 64; - int32_t filter12, filter34, filter56, filter78; - int32_t Temp1, Temp2, Temp3; - uint32_t qload1, qload2; - uint32_t p1, p2, p3, p4, p5; - uint32_t st1, st2, st3; - uint32_t dst_pitch_2 = (dst_stride << 1); - uint8_t *odd_dst; - - filter12 = ((const int32_t *)filter_x0)[0]; - filter34 = ((const int32_t *)filter_x0)[1]; - filter56 = ((const int32_t *)filter_x0)[2]; - filter78 = ((const int32_t *)filter_x0)[3]; - - for (y = h; y--;) { - /* prefetch data to cache memory */ - prefetch_load(src_ptr + src_stride); - prefetch_load(src_ptr + src_stride + 32); - prefetch_load(src_ptr + src_stride + 64); - - src = src_ptr; - dst = dst_ptr; - - odd_dst = (dst + dst_stride); - - for (c = 0; c < 4; c++) { - __asm__ __volatile__( - "ulw %[qload1], 0(%[src]) " - "\n\t" - "ulw %[qload2], 4(%[src]) " - "\n\t" - - /* even 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 1 */ - "mthi $zero, $ac1 " - "\n\t" - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 2 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "ulw %[qload2], 8(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p2], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p3], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac1, %[p4], %[filter78] " - "\n\t" /* even 1 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 1 */ - - /* even 2. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 3 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "ulw %[qload1], 12(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p2], %[filter12] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p3], %[filter34] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p4], %[filter56] " - "\n\t" /* even 1 */ - "dpa.w.ph $ac2, %[p1], %[filter78] " - "\n\t" /* even 1 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 1 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 1 */ - - /* even 3. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 4 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 1 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - " \n\t" - "dpa.w.ph $ac3, %[p3], %[filter12] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p4], %[filter34] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p1], %[filter56] " - "\n\t" /* even 3 */ - "dpa.w.ph $ac3, %[p5], %[filter78] " - "\n\t" /* even 3 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 3 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 1 */ - - /* even 4. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 5 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 2 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 16(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p4], %[filter12] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p1], %[filter34] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p5], %[filter56] " - "\n\t" /* even 4 */ - "dpa.w.ph $ac1, %[p2], %[filter78] " - "\n\t" /* even 4 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 4 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 3 */ - - /* even 5. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* even 6 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 3 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter12] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p5], %[filter34] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p2], %[filter56] " - "\n\t" /* even 5 */ - "dpa.w.ph $ac2, %[p3], %[filter78] " - "\n\t" /* even 5 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 5 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 4 */ - - /* even 6. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* even 7 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 4 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 20(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p5], %[filter12] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* even 6 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* even 6 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* even 6 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 5 */ - - /* even 7. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* even 8 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 5 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* even 7 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* even 7 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* even 7 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* even 6 */ - - /* even 8. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 1 */ - "mthi $zero, $ac3 " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* even 8 */ - "sb %[st3], 0(%[dst]) " - "\n\t" /* even 6 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* even 8 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* even 8 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* even 8 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* even 7 */ - - /* ODD pixels */ - "ulw %[qload1], 1(%[src]) " - "\n\t" - "ulw %[qload2], 5(%[src]) " - "\n\t" - - /* odd 1. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 2 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload1] " - "\n\t" - "preceu.ph.qbl %[p2], %[qload1] " - "\n\t" - "preceu.ph.qbr %[p3], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p4], %[qload2] " - "\n\t" - "sb %[st1], 0(%[dst]) " - "\n\t" /* even 7 */ - "addu %[dst], %[dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 9(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p1], %[filter12] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p2], %[filter34] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p3], %[filter56] " - "\n\t" /* odd 1 */ - "dpa.w.ph $ac3, %[p4], %[filter78] " - "\n\t" /* odd 1 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 1 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* even 8 */ - - /* odd 2. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 3 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p1], %[qload2] " - "\n\t" - "preceu.ph.qbl %[p5], %[qload2] " - "\n\t" - "sb %[st2], 0(%[dst]) " - "\n\t" /* even 8 */ - "ulw %[qload1], 13(%[src]) " - "\n\t" - "dpa.w.ph $ac1, %[p2], %[filter12] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p3], %[filter34] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p4], %[filter56] " - "\n\t" /* odd 2 */ - "dpa.w.ph $ac1, %[p1], %[filter78] " - "\n\t" /* odd 2 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 2 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 1 */ - - /* odd 3. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 4 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbr %[p2], %[qload1] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 1 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac2, %[p3], %[filter12] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p4], %[filter34] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p1], %[filter56] " - "\n\t" /* odd 3 */ - "dpa.w.ph $ac2, %[p5], %[filter78] " - "\n\t" /* odd 3 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 3 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 2 */ - - /* odd 4. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 5 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbl %[p3], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 2 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload2], 17(%[src]) " - "\n\t" - "dpa.w.ph $ac3, %[p4], %[filter12] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p1], %[filter34] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p5], %[filter56] " - "\n\t" /* odd 4 */ - "dpa.w.ph $ac3, %[p2], %[filter78] " - "\n\t" /* odd 4 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 4 */ - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 3 */ - - /* odd 5. pixel */ - "mtlo %[vector_64], $ac2 " - "\n\t" /* odd 6 */ - "mthi $zero, $ac2 " - "\n\t" - "preceu.ph.qbr %[p4], %[qload2] " - "\n\t" - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 3 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac1, %[p1], %[filter12] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p5], %[filter34] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p2], %[filter56] " - "\n\t" /* odd 5 */ - "dpa.w.ph $ac1, %[p3], %[filter78] " - "\n\t" /* odd 5 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 5 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 4 */ - - /* odd 6. pixel */ - "mtlo %[vector_64], $ac3 " - "\n\t" /* odd 7 */ - "mthi $zero, $ac3 " - "\n\t" - "preceu.ph.qbl %[p1], %[qload2] " - "\n\t" - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 4 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "ulw %[qload1], 21(%[src]) " - "\n\t" - "dpa.w.ph $ac2, %[p5], %[filter12] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p2], %[filter34] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p3], %[filter56] " - "\n\t" /* odd 6 */ - "dpa.w.ph $ac2, %[p4], %[filter78] " - "\n\t" /* odd 6 */ - "extp %[Temp2], $ac2, 31 " - "\n\t" /* odd 6 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 5 */ - - /* odd 7. pixel */ - "mtlo %[vector_64], $ac1 " - "\n\t" /* odd 8 */ - "mthi $zero, $ac1 " - "\n\t" - "preceu.ph.qbr %[p5], %[qload1] " - "\n\t" - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 5 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - "dpa.w.ph $ac3, %[p2], %[filter12] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p3], %[filter34] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p4], %[filter56] " - "\n\t" /* odd 7 */ - "dpa.w.ph $ac3, %[p1], %[filter78] " - "\n\t" /* odd 7 */ - "extp %[Temp3], $ac3, 31 " - "\n\t" /* odd 7 */ - - /* odd 8. pixel */ - "dpa.w.ph $ac1, %[p3], %[filter12] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p4], %[filter34] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p1], %[filter56] " - "\n\t" /* odd 8 */ - "dpa.w.ph $ac1, %[p5], %[filter78] " - "\n\t" /* odd 8 */ - "extp %[Temp1], $ac1, 31 " - "\n\t" /* odd 8 */ - - "lbux %[st2], %[Temp2](%[cm]) " - "\n\t" /* odd 6 */ - "lbux %[st3], %[Temp3](%[cm]) " - "\n\t" /* odd 7 */ - "lbux %[st1], %[Temp1](%[cm]) " - "\n\t" /* odd 8 */ - - "sb %[st2], 0(%[odd_dst]) " - "\n\t" /* odd 6 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st3], 0(%[odd_dst]) " - "\n\t" /* odd 7 */ - "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " - "\n\t" - - "sb %[st1], 0(%[odd_dst]) " - "\n\t" /* odd 8 */ - - : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), - [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), - [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), - [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) - : [filter12] "r"(filter12), [filter34] "r"(filter34), - [filter56] "r"(filter56), [filter78] "r"(filter78), - [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), - [dst_pitch_2] "r"(dst_pitch_2)); - - src += 16; - dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); - odd_dst = (dst + dst_stride); - } - - /* Next row... */ - src_ptr += src_stride; - - dst_ptr += 1; - } -} - -void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter, int w, int h) { - int x, y, k; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - int sum = 0; - - for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k]; - - dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); - } - - src += src_stride; - dst += 1; - } -} - -void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int w, int h) { - int x, y; - - for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x * dst_stride] = src[x]; - } - - src += src_stride; - dst += 1; - } -} - -void aom_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); - int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; - uint32_t pos = 38; - - (void)x_step_q4; - - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - assert(((const int32_t *)filter_x)[1] != 0x800000); - assert(((const int32_t *)filter_y)[1] != 0x800000); - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - if (intermediate_height < h) intermediate_height = h; - - /* copy the src to dst */ - if (filter_x[3] == 0x80) { - copy_horiz_transposed(src - src_stride * 3, src_stride, temp, - intermediate_height, w, intermediate_height); - } else if (((const int32_t *)filter_x)[0] == 0) { - aom_convolve2_dspr2(src - src_stride * 3, src_stride, temp, - intermediate_height, filter_x, w, intermediate_height); - } else { - src -= (src_stride * 3 + 3); - - /* prefetch data to cache memory */ - prefetch_load(src); - prefetch_load(src + 32); - - switch (w) { - case 4: - convolve_horiz_4_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - intermediate_height); - break; - case 8: - convolve_horiz_8_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - intermediate_height); - break; - case 16: - case 32: - convolve_horiz_16_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - intermediate_height, (w / 16)); - break; - case 64: - prefetch_load(src + 32); - convolve_horiz_64_transposed_dspr2(src, src_stride, temp, - intermediate_height, filter_x, - intermediate_height); - break; - default: - convolve_horiz_transposed(src, src_stride, temp, intermediate_height, - filter_x, w, intermediate_height); - break; - } - } - - /* copy the src to dst */ - if (filter_y[3] == 0x80) { - copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); - } else if (((const int32_t *)filter_y)[0] == 0) { - aom_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, - filter_y, h, w); - } else { - switch (h) { - case 4: - convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w); - break; - case 8: - convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w); - break; - case 16: - case 32: - convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w, (h / 16)); - break; - case 64: - convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst, - dst_stride, filter_y, w); - break; - default: - convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride, - filter_y, h, w); - break; - } - } -} - void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c index c60557617..f9c6879ab 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c index d8a90b6ab..201e66427 100644 --- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/convolve_common_dspr2.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h index f8fd9e2b6..e7b8d531b 100644 --- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h @@ -14,7 +14,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_dsp/mips/common_dspr2.h" @@ -29,18 +30,6 @@ void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h); -void aom_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h); - -void aom_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); - void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, int h); diff --git a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c b/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c deleted file mode 100644 index 43dce8ba6..000000000 --- a/third_party/aom/aom_dsp/mips/fwd_dct32x32_msa.c +++ /dev/null @@ -1,928 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/fwd_txfm_msa.h" - -static void fdct8x32_1d_column_load_butterfly(const int16_t *input, - int32_t src_stride, - int16_t *temp_buff) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 step0, step1, step2, step3; - v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; - v8i16 step0_1, step1_1, step2_1, step3_1; - - /* 1st and 2nd set */ - LD_SH4(input, src_stride, in0, in1, in2, in3); - LD_SH4(input + (28 * src_stride), src_stride, in4, in5, in6, in7); - LD_SH4(input + (4 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); - LD_SH4(input + (24 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); - SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, - step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, - step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH4(step0, step1, step2, step3, temp_buff, 8); - ST_SH4(in4, in5, in6, in7, temp_buff + (28 * 8), 8); - ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (4 * 8), 8); - ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (24 * 8), 8); - - /* 3rd and 4th set */ - LD_SH4(input + (8 * src_stride), src_stride, in0, in1, in2, in3); - LD_SH4(input + (20 * src_stride), src_stride, in4, in5, in6, in7); - LD_SH4(input + (12 * src_stride), src_stride, in0_1, in1_1, in2_1, in3_1); - LD_SH4(input + (16 * src_stride), src_stride, in4_1, in5_1, in6_1, in7_1); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in0_1, in1_1, in2_1, in3_1, 2); - SLLI_4V(in4_1, in5_1, in6_1, in7_1, 2); - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2, - step3, in4, in5, in6, in7); - BUTTERFLY_8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, step0_1, - step1_1, step2_1, step3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH4(step0, step1, step2, step3, temp_buff + (8 * 8), 8); - ST_SH4(in4, in5, in6, in7, temp_buff + (20 * 8), 8); - ST_SH4(step0_1, step1_1, step2_1, step3_1, temp_buff + (12 * 8), 8); - ST_SH4(in4_1, in5_1, in6_1, in7_1, temp_buff + (15 * 8) + 8, 8); -} - -static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8i16 temp0, temp1; - - /* fdct even */ - LD_SH4(input, 8, in0, in1, in2, in3); - LD_SH4(input + 96, 8, in12, in13, in14, in15); - BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, - vec3, in12, in13, in14, in15); - LD_SH4(input + 32, 8, in4, in5, in6, in7); - LD_SH4(input + 64, 8, in8, in9, in10, in11); - BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, - in8, in9, in10, in11); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); - DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp); - ST_SH(temp1, temp + 512); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 256); - ST_SH(temp1, temp + 768); - - SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 128); - ST_SH(temp1, temp + 896); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 640); - ST_SH(temp1, temp + 384); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 64); - ST_SH(temp1, temp + 960); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 576); - ST_SH(temp1, temp + 448); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 320); - ST_SH(temp1, temp + 704); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - FDCT32_POSTPROC_2V_POS_H(temp0, temp1); - ST_SH(temp0, temp + 192); - ST_SH(temp1, temp + 832); -} - -static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; - - in20 = LD_SH(input + 32); - in21 = LD_SH(input + 40); - in26 = LD_SH(input + 80); - in27 = LD_SH(input + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - in18 = LD_SH(input + 16); - in19 = LD_SH(input + 24); - in28 = LD_SH(input + 96); - in29 = LD_SH(input + 104); - - vec4 = in19 - in20; - ST_SH(vec4, input + 32); - vec4 = in18 - in21; - ST_SH(vec4, input + 40); - vec4 = in29 - in26; - ST_SH(vec4, input + 80); - vec4 = in28 - in27; - ST_SH(vec4, input + 88); - - in21 = in18 + in21; - in20 = in19 + in20; - in27 = in28 + in27; - in26 = in29 + in26; - - LD_SH4(input + 48, 8, in22, in23, in24, in25); - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - - in16 = LD_SH(input); - in17 = LD_SH(input + 8); - in30 = LD_SH(input + 112); - in31 = LD_SH(input + 120); - - vec4 = in17 - in22; - ST_SH(vec4, input + 16); - vec4 = in16 - in23; - ST_SH(vec4, input + 24); - vec4 = in31 - in24; - ST_SH(vec4, input + 96); - vec4 = in30 - in25; - ST_SH(vec4, input + 104); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr); - ST_SH(vec4, temp_ptr + 960); - - SUB2(in27, in26, in25, in24, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 448); - ST_SH(vec4, temp_ptr + 512); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec4, temp_ptr + 704); - ST_SH(vec5, temp_ptr + 256); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec4, temp_ptr + 192); - ST_SH(vec5, temp_ptr + 768); - - LD_SH4(input + 16, 8, in22, in23, in20, in21); - LD_SH4(input + 80, 8, in26, in27, in24, in25); - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - ADD2(in28, in29, in31, in30, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 832); - ST_SH(vec4, temp_ptr + 128); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 320); - ST_SH(vec4, temp_ptr + 640); - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 576); - ST_SH(vec4, temp_ptr + 384); - - ADD2(in29, in28, in30, in31, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - FDCT32_POSTPROC_2V_POS_H(vec5, vec4); - ST_SH(vec5, temp_ptr + 64); - ST_SH(vec4, temp_ptr + 896); -} - -static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride, - int16_t *tmp_buf, int16_t *tmp_buf_big) { - fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf); - fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big); - fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32)); -} - -static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff, - int16_t *output) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 step0, step1, step2, step3, step4, step5, step6, step7; - - LD_SH8(temp_buff, 32, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp_buff + 24, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, - step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, output, 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 24 * 8), 8); - - /* 2nd set */ - LD_SH8(temp_buff + 8, 32, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp_buff + 16, 32, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, step0, step1, step2, step3, step4, step5, - step6, step7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(step0, step1, step2, step3, step4, step5, step6, step7, - (output + 8 * 8), 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, (output + 16 * 8), 8); -} - -static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, - int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; - v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; - v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, - vec7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); - - /* Stage 3 */ - UNPCK_SH_SW(vec0, vec0_l, vec0_r); - UNPCK_SH_SW(vec1, vec1_l, vec1_r); - UNPCK_SH_SW(vec2, vec2_l, vec2_r); - UNPCK_SH_SW(vec3, vec3_l, vec3_r); - UNPCK_SH_SW(vec4, vec4_l, vec4_r); - UNPCK_SH_SW(vec5, vec5_l, vec5_r); - UNPCK_SH_SW(vec6, vec6_l, vec6_r); - UNPCK_SH_SW(vec7, vec7_l, vec7_r); - ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, - tmp1_w, tmp2_w, tmp3_w); - BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); - ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, - vec1_r, vec2_r, vec3_r); - - tmp3_w = vec0_r + vec3_r; - vec0_r = vec0_r - vec3_r; - vec3_r = vec1_r + vec2_r; - vec1_r = vec1_r - vec2_r; - - DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, - vec4_r, tmp3_w, vec6_r, vec3_r); - FDCT32_POSTPROC_NEG_W(vec4_r); - FDCT32_POSTPROC_NEG_W(tmp3_w); - FDCT32_POSTPROC_NEG_W(vec6_r); - FDCT32_POSTPROC_NEG_W(vec3_r); - PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); - ST_SH2(vec5, vec4, out, 8); - - DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, - vec4_r, tmp3_w, vec6_r, vec3_r); - FDCT32_POSTPROC_NEG_W(vec4_r); - FDCT32_POSTPROC_NEG_W(tmp3_w); - FDCT32_POSTPROC_NEG_W(vec6_r); - FDCT32_POSTPROC_NEG_W(vec3_r); - PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); - ST_SH2(vec5, vec4, out + 16, 8); - - LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 32); - ST_SH(in5, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 40); - ST_SH(in5, out + 48); - - LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 64); - ST_SH(in5, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 72); - ST_SH(in5, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 80); - ST_SH(in5, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); - FDCT_POSTPROC_2V_NEG_H(in4, in5); - ST_SH(in4, out + 96); - ST_SH(in5, out + 88); -} - -static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, - vec7, in8, in9, in10, in11, in12, in13, in14, in15); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); - DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out); - ST_SH(temp1, out + 8); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 16); - ST_SH(temp1, out + 24); - - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 32); - ST_SH(temp1, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 40); - ST_SH(temp1, out + 48); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 64); - ST_SH(temp1, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 72); - ST_SH(temp1, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5) - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 80); - ST_SH(temp1, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - FDCT_POSTPROC_2V_NEG_H(temp0, temp1); - ST_SH(temp0, out + 96); - ST_SH(temp1, out + 88); -} - -static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr, - int16_t *out) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; - - in20 = LD_SH(temp + 32); - in21 = LD_SH(temp + 40); - in26 = LD_SH(temp + 80); - in27 = LD_SH(temp + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - in18 = LD_SH(temp + 16); - in19 = LD_SH(temp + 24); - in28 = LD_SH(temp + 96); - in29 = LD_SH(temp + 104); - - vec4 = in19 - in20; - ST_SH(vec4, interm_ptr + 32); - vec4 = in18 - in21; - ST_SH(vec4, interm_ptr + 88); - vec4 = in28 - in27; - ST_SH(vec4, interm_ptr + 56); - vec4 = in29 - in26; - ST_SH(vec4, interm_ptr + 64); - - ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); - - in22 = LD_SH(temp + 48); - in23 = LD_SH(temp + 56); - in24 = LD_SH(temp + 64); - in25 = LD_SH(temp + 72); - - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - - in16 = LD_SH(temp); - in17 = LD_SH(temp + 8); - in30 = LD_SH(temp + 112); - in31 = LD_SH(temp + 120); - - vec4 = in17 - in22; - ST_SH(vec4, interm_ptr + 40); - vec4 = in30 - in25; - ST_SH(vec4, interm_ptr + 48); - vec4 = in31 - in24; - ST_SH(vec4, interm_ptr + 72); - vec4 = in16 - in23; - ST_SH(vec4, interm_ptr + 80); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - - ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out); - ST_SH(vec4, out + 120); - - SUB2(in27, in26, in25, in24, in22, in21); - - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 112); - ST_SH(vec4, out + 8); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 16); - ST_SH(vec5, out + 104); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 24); - ST_SH(vec5, out + 96); - - in20 = LD_SH(interm_ptr + 32); - in21 = LD_SH(interm_ptr + 88); - in27 = LD_SH(interm_ptr + 56); - in26 = LD_SH(interm_ptr + 64); - - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - - in22 = LD_SH(interm_ptr + 40); - in25 = LD_SH(interm_ptr + 48); - in24 = LD_SH(interm_ptr + 72); - in23 = LD_SH(interm_ptr + 80); - - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - ADD2(in28, in29, in31, in30, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 32); - ST_SH(vec4, out + 88); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 40); - ST_SH(vec4, out + 80); - - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec5, out + 72); - ST_SH(vec4, out + 48); - - ADD2(in29, in28, in30, in31, in17, in18); - - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - FDCT_POSTPROC_2V_NEG_H(vec5, vec4); - ST_SH(vec4, out + 56); - ST_SH(vec5, out + 64); -} - -static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1; - - /* 1st set */ - in0 = LD_SH(temp); - in4 = LD_SH(temp + 32); - in2 = LD_SH(temp + 64); - in6 = LD_SH(temp + 96); - in1 = LD_SH(temp + 128); - in7 = LD_SH(temp + 152); - in3 = LD_SH(temp + 192); - in5 = LD_SH(temp + 216); - - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - /* 2nd set */ - in0_1 = LD_SH(temp + 16); - in1_1 = LD_SH(temp + 232); - in2_1 = LD_SH(temp + 80); - in3_1 = LD_SH(temp + 168); - in4_1 = LD_SH(temp + 48); - in5_1 = LD_SH(temp + 176); - in6_1 = LD_SH(temp + 112); - in7_1 = LD_SH(temp + 240); - - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 32); - TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); - - /* 3rd set */ - in0 = LD_SH(temp + 8); - in1 = LD_SH(temp + 136); - in2 = LD_SH(temp + 72); - in3 = LD_SH(temp + 200); - in4 = LD_SH(temp + 40); - in5 = LD_SH(temp + 208); - in6 = LD_SH(temp + 104); - in7 = LD_SH(temp + 144); - - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 8, - 32); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output + 16, 32); - - /* 4th set */ - in0_1 = LD_SH(temp + 24); - in1_1 = LD_SH(temp + 224); - in2_1 = LD_SH(temp + 88); - in3_1 = LD_SH(temp + 160); - in4_1 = LD_SH(temp + 56); - in5_1 = LD_SH(temp + 184); - in6_1 = LD_SH(temp + 120); - in7_1 = LD_SH(temp + 248); - - TRANSPOSE8x8_SH_SH(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, - in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1); - ST_SH8(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1, output + 24, - 32); -} - -static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) { - fdct8x32_1d_row_load_butterfly(temp, temp_buf); - fdct8x32_1d_row_even(temp_buf, temp_buf); - fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128); - fdct8x32_1d_row_transpose_store(temp_buf, output); -} - -static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf, - int16_t *output) { - fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); - fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf); - fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128); - fdct8x32_1d_row_transpose_store(tmp_buf, output); -} - -void aom_fdct32x32_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); - - /* column transform */ - for (i = 0; i < 4; ++i) { - fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf, - tmp_buf_big + (8 * i)); - } - - /* row transform */ - fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output); - - /* row transform */ - for (i = 1; i < 4; ++i) { - fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256)); - } -} - -static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; - - /* fdct32 even */ - /* stage 2 */ - LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); - - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, - vec7, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT_POSTPROC_2V_NEG_H(vec0, vec1); - FDCT_POSTPROC_2V_NEG_H(vec2, vec3); - FDCT_POSTPROC_2V_NEG_H(vec4, vec5); - FDCT_POSTPROC_2V_NEG_H(vec6, vec7); - FDCT_POSTPROC_2V_NEG_H(in8, in9); - FDCT_POSTPROC_2V_NEG_H(in10, in11); - FDCT_POSTPROC_2V_NEG_H(in12, in13); - FDCT_POSTPROC_2V_NEG_H(in14, in15); - - /* Stage 3 */ - ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); - - temp0 = in0 + in3; - in0 = in0 - in3; - in3 = in1 + in2; - in1 = in1 - in2; - - DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); - ST_SH(temp0, out); - ST_SH(temp1, out + 8); - - DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); - ST_SH(temp0, out + 16); - ST_SH(temp1, out + 24); - - SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); - DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); - ADD2(vec4, vec5, vec7, vec6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); - ST_SH(temp0, out + 32); - ST_SH(temp1, out + 56); - - SUB2(vec4, vec5, vec7, vec6, vec4, vec7); - DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); - ST_SH(temp0, out + 40); - ST_SH(temp1, out + 48); - - DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); - DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); - ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); - DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); - ADD2(in0, in1, in2, in3, vec0, vec7); - DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); - ST_SH(temp0, out + 64); - ST_SH(temp1, out + 120); - - SUB2(in0, in1, in2, in3, in0, in2); - DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); - ST_SH(temp0, out + 72); - ST_SH(temp1, out + 112); - - SUB2(in9, vec2, in14, vec5, vec2, vec5); - DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); - SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); - DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); - ST_SH(temp0, out + 80); - ST_SH(temp1, out + 104); - - ADD2(in3, in2, in0, in1, vec3, vec4); - DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); - ST_SH(temp0, out + 96); - ST_SH(temp1, out + 88); -} - -static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, - int16_t *out) { - v8i16 in16, in17, in18, in19, in20, in21, in22, in23; - v8i16 in24, in25, in26, in27, in28, in29, in30, in31; - v8i16 vec4, vec5; - - in20 = LD_SH(temp + 32); - in21 = LD_SH(temp + 40); - in26 = LD_SH(temp + 80); - in27 = LD_SH(temp + 88); - - DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); - DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); - - FDCT_POSTPROC_2V_NEG_H(in20, in21); - FDCT_POSTPROC_2V_NEG_H(in26, in27); - - in18 = LD_SH(temp + 16); - in19 = LD_SH(temp + 24); - in28 = LD_SH(temp + 96); - in29 = LD_SH(temp + 104); - - FDCT_POSTPROC_2V_NEG_H(in18, in19); - FDCT_POSTPROC_2V_NEG_H(in28, in29); - - vec4 = in19 - in20; - ST_SH(vec4, interm_ptr + 32); - vec4 = in18 - in21; - ST_SH(vec4, interm_ptr + 88); - vec4 = in29 - in26; - ST_SH(vec4, interm_ptr + 64); - vec4 = in28 - in27; - ST_SH(vec4, interm_ptr + 56); - - ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); - - in22 = LD_SH(temp + 48); - in23 = LD_SH(temp + 56); - in24 = LD_SH(temp + 64); - in25 = LD_SH(temp + 72); - - DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); - DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); - FDCT_POSTPROC_2V_NEG_H(in22, in23); - FDCT_POSTPROC_2V_NEG_H(in24, in25); - - in16 = LD_SH(temp); - in17 = LD_SH(temp + 8); - in30 = LD_SH(temp + 112); - in31 = LD_SH(temp + 120); - - FDCT_POSTPROC_2V_NEG_H(in16, in17); - FDCT_POSTPROC_2V_NEG_H(in30, in31); - - vec4 = in17 - in22; - ST_SH(vec4, interm_ptr + 40); - vec4 = in30 - in25; - ST_SH(vec4, interm_ptr + 48); - vec4 = in31 - in24; - ST_SH(vec4, interm_ptr + 72); - vec4 = in16 - in23; - ST_SH(vec4, interm_ptr + 80); - - ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); - DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); - DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); - ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); - DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); - ADD2(in27, in26, in25, in24, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); - ST_SH(vec5, out); - ST_SH(vec4, out + 120); - - SUB2(in27, in26, in25, in24, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); - ST_SH(vec5, out + 112); - ST_SH(vec4, out + 8); - - SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); - DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); - SUB2(in26, in27, in24, in25, in23, in20); - DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); - ST_SH(vec4, out + 16); - ST_SH(vec5, out + 104); - - ADD2(in26, in27, in24, in25, in22, in21); - DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); - ST_SH(vec4, out + 24); - ST_SH(vec5, out + 96); - - in20 = LD_SH(interm_ptr + 32); - in21 = LD_SH(interm_ptr + 88); - in27 = LD_SH(interm_ptr + 56); - in26 = LD_SH(interm_ptr + 64); - - in16 = in20; - in17 = in21; - DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); - DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); - - in22 = LD_SH(interm_ptr + 40); - in25 = LD_SH(interm_ptr + 48); - in24 = LD_SH(interm_ptr + 72); - in23 = LD_SH(interm_ptr + 80); - - SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); - DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); - in16 = in28 + in29; - in19 = in31 + in30; - DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); - ST_SH(vec5, out + 32); - ST_SH(vec4, out + 88); - - SUB2(in28, in29, in31, in30, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); - ST_SH(vec5, out + 40); - ST_SH(vec4, out + 80); - - ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); - DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); - SUB2(in29, in28, in30, in31, in16, in19); - DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); - ST_SH(vec5, out + 72); - ST_SH(vec4, out + 48); - - ADD2(in29, in28, in30, in31, in17, in18); - DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); - ST_SH(vec4, out + 56); - ST_SH(vec5, out + 64); -} - -static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf, - int16_t *output) { - fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf); - fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf); - fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128)); - fdct8x32_1d_row_transpose_store(tmp_buf, output); -} - -void aom_fdct32x32_rd_msa(const int16_t *input, int16_t *out, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]); - DECLARE_ALIGNED(32, int16_t, tmp_buf[256]); - - /* column transform */ - for (i = 0; i < 4; ++i) { - fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0], - &tmp_buf_big[0] + (8 * i)); - } - - /* row transform */ - for (i = 0; i < 4; ++i) { - fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0], - out + (8 * i * 32)); - } -} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c deleted file mode 100644 index 7a285b7b8..000000000 --- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/fwd_txfm_msa.h" - -void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, - int32_t src_stride) { - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; - v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; - v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; - v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; - v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, - cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; - v8i16 coeff2 = { - -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 - }; - - LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, - in10, in11, in12, in13, in14, in15); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - SLLI_4V(in8, in9, in10, in11, 2); - SLLI_4V(in12, in13, in14, in15, 2); - ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); - ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, - tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); - SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); - SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); - - tmp_ptr += 16; - - /* stp 1 */ - ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); - ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); - - cnst4 = __msa_splati_h(coeff, 0); - stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); - - cnst5 = __msa_splati_h(coeff, 1); - cnst5 = __msa_ilvev_h(cnst5, cnst4); - stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); - stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); - stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); - - /* stp2 */ - BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); - BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); - ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); - ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); - SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); - - cnst0 = __msa_splati_h(coeff, 4); - cnst1 = __msa_ilvev_h(cnst1, cnst0); - stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); - - BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); - ILVRL_H2_SH(in15, in8, vec1, vec0); - SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr); - - cnst0 = __msa_splati_h(coeff2, 0); - cnst0 = __msa_ilvev_h(cnst1, cnst0); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 224); - - ILVRL_H2_SH(in14, in9, vec1, vec0); - SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); - cnst1 = __msa_ilvev_h(cnst1, cnst0); - - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); - ST_SH(in8, tmp_ptr + 128); - - cnst1 = __msa_splati_h(coeff2, 2); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 96); - - SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); - cnst1 = __msa_ilvev_h(cnst1, cnst0); - - stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); - - cnst1 = __msa_splati_h(coeff, 3); - cnst1 = __msa_ilvev_h(cnst0, cnst1); - stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); - - /* stp4 */ - ADD2(stp34, stp25, stp33, stp22, in13, in10); - - ILVRL_H2_SH(in13, in10, vec1, vec0); - SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 64); - - cnst0 = __msa_splati_h(coeff2, 1); - cnst0 = __msa_ilvev_h(cnst1, cnst0); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 160); - - SUB2(stp34, stp25, stp33, stp22, in12, in11); - ILVRL_H2_SH(in12, in11, vec1, vec0); - SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); - cnst1 = __msa_ilvev_h(cnst1, cnst0); - - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); - ST_SH(in8, tmp_ptr + 192); - - cnst1 = __msa_splati_h(coeff2, 3); - cnst0 = __msa_ilvev_h(cnst0, cnst1); - in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); - ST_SH(in8, tmp_ptr + 32); -} - -void fdct16x8_1d_row(int16_t *input, int16_t *output) { - v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 in8, in9, in10, in11, in12, in13, in14, in15; - - LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); - LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, - in10, in11, in12, in13, in14, in15); - ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); - ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); - ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); - ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); - SRA_4V(in0, in1, in2, in3, 2); - SRA_4V(in4, in5, in6, in7, 2); - SRA_4V(in8, in9, in10, in11, 2); - SRA_4V(in12, in13, in14, in15, 2); - BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, - in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, - tmp7, in8, in9, in10, in11, in12, in13, in14, in15); - ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); - FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, - tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); - FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, - in4, in5, in6, in7); - TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, - tmp1, in1, tmp2, in2, tmp3, in3); - ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); - TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, - tmp5, in5, tmp6, in6, tmp7, in7); - ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); -} - -void aom_fdct4x4_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - v8i16 in0, in1, in2, in3; - - LD_SH4(input, src_stride, in0, in1, in2, in3); - - /* fdct4 pre-process */ - { - v8i16 vec, mask; - v16i8 zero = { 0 }; - v16i8 one = __msa_ldi_b(1); - - mask = (v8i16)__msa_sldi_b(zero, one, 15); - SLLI_4V(in0, in1, in2, in3, 4); - vec = __msa_ceqi_h(in0, 0); - vec = vec ^ 255; - vec = mask & vec; - in0 += vec; - } - - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); - SRA_4V(in0, in1, in2, in3, 2); - PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); - ST_SH2(in0, in2, output, 8); -} - -void aom_fdct8x8_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - - LD_SH8(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7); - SLLI_4V(in0, in1, in2, in3, 2); - SLLI_4V(in4, in5, in6, in7, 2); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, - in5, in6, in7); - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7); - ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); -} - -void aom_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - out[0] = LD_HADD(input, stride); - out[1] = 0; -} - -void aom_fdct16x16_msa(const int16_t *input, int16_t *output, - int32_t src_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]); - - /* column transform */ - for (i = 0; i < 2; ++i) { - fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride); - } - - /* row transform */ - for (i = 0; i < 2; ++i) { - fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i))); - } -} diff --git a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h b/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h deleted file mode 100644 index ada25dffd..000000000 --- a/third_party/aom/aom_dsp/mips/fwd_txfm_msa.h +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_MIPS_FWD_TXFM_MSA_H_ -#define AOM_DSP_MIPS_FWD_TXFM_MSA_H_ - -#include "aom_dsp/mips/txfm_macros_msa.h" -#include "aom_dsp/txfm_common.h" - -#define LD_HADD(psrc, stride) \ - ({ \ - v8i16 in0_m, in1_m, in2_m, in3_m, in4_m, in5_m, in6_m, in7_m; \ - v4i32 vec_w_m; \ - \ - LD_SH4((psrc), stride, in0_m, in1_m, in2_m, in3_m); \ - ADD2(in0_m, in1_m, in2_m, in3_m, in0_m, in2_m); \ - LD_SH4(((psrc) + 4 * stride), stride, in4_m, in5_m, in6_m, in7_m); \ - ADD4(in4_m, in5_m, in6_m, in7_m, in0_m, in2_m, in4_m, in6_m, in4_m, in6_m, \ - in0_m, in4_m); \ - in0_m += in4_m; \ - \ - vec_w_m = __msa_hadd_s_w(in0_m, in0_m); \ - HADD_SW_S32(vec_w_m); \ - }) - -#define AOM_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \ - v4i32 vec4_m, vec5_m, vec6_m, vec7_m; \ - v8i16 coeff_m = { \ - cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, 0, 0, 0 \ - }; \ - \ - BUTTERFLY_4(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m); \ - ILVR_H2_SH(vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - vec5_m = __msa_dotp_s_w(vec0_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 3, cnst2_m, cnst3_m); \ - cnst2_m = __msa_ilvev_h(cnst3_m, cnst2_m); \ - vec7_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - vec4_m = __msa_dotp_s_w(vec0_m, cnst0_m); \ - cnst2_m = __msa_splati_h(coeff_m, 2); \ - cnst2_m = __msa_ilvev_h(cnst2_m, cnst3_m); \ - vec6_m = __msa_dotp_s_w(vec2_m, cnst2_m); \ - \ - SRARI_W4_SW(vec4_m, vec5_m, vec6_m, vec7_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(vec4_m, vec4_m, vec5_m, vec5_m, vec6_m, vec6_m, vec7_m, \ - vec7_m, out0, out2, out1, out3); \ - } - -#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7) \ - { \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - SRLI_H4_SH(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m, 15); \ - SRLI_H4_SH(in4, in5, in6, in7, vec4_m, vec5_m, vec6_m, vec7_m, 15); \ - AVE_SH4_SH(vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m, in3, in0, in1, \ - in2, in3); \ - AVE_SH4_SH(vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m, in7, in4, in5, \ - in6, in7); \ - } - -#define AOM_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m; \ - v8i16 s7_m, x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ - \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ - s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ - \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ - \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ - \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - } - -#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v8i16 x0_m, x1_m, x2_m, x3_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - cospi_4_64, cospi_28_64, cospi_12_64, cospi_20_64 }; \ - \ - /* FDCT stage1 */ \ - BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, s2_m, \ - s3_m, s4_m, s5_m, s6_m, s7_m); \ - BUTTERFLY_4(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m); \ - ILVL_H2_SH(x1_m, x0_m, x3_m, x2_m, s0_m, s2_m); \ - ILVR_H2_SH(x1_m, x0_m, x3_m, x2_m, s1_m, s3_m); \ - SPLATI_H2_SH(coeff_m, 0, 1, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x1_m, x0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, x2_m, x3_m); \ - x2_m = -x2_m; \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out6 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - out0 = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - x2_m = __msa_splati_h(coeff_m, 2); \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m); \ - \ - /* stage2 */ \ - ILVRL_H2_SH(s5_m, s6_m, s1_m, s0_m); \ - \ - s6_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m); \ - s5_m = DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m); \ - \ - /* stage3 */ \ - BUTTERFLY_4(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m); \ - \ - /* stage4 */ \ - ILVL_H2_SH(x3_m, x0_m, x2_m, x1_m, s4_m, s6_m); \ - ILVR_H2_SH(x3_m, x0_m, x2_m, x1_m, s5_m, s7_m); \ - \ - SPLATI_H2_SH(coeff_m, 4, 5, x0_m, x1_m); \ - x1_m = __msa_ilvev_h(x0_m, x1_m); \ - out1 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m); \ - \ - SPLATI_H2_SH(coeff_m, 6, 7, x2_m, x3_m); \ - x2_m = __msa_ilvev_h(x3_m, x2_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - \ - x1_m = __msa_splati_h(coeff_m, 5); \ - x0_m = -x0_m; \ - x0_m = __msa_ilvev_h(x1_m, x0_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m); \ - \ - x2_m = __msa_splati_h(coeff_m, 6); \ - x3_m = -x3_m; \ - x2_m = __msa_ilvev_h(x2_m, x3_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m); \ - } - -#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6, \ - input7, out1, out3, out5, out7, out9, out11, out13, \ - out15) \ - { \ - v8i16 stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m; \ - v8i16 stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m; \ - v8i16 stp36_m, stp37_m, vec0_m, vec1_m; \ - v8i16 vec2_m, vec3_m, vec4_m, vec5_m, vec6_m; \ - v8i16 cnst0_m, cnst1_m, cnst4_m, cnst5_m; \ - v8i16 coeff_m = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, \ - -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; \ - v8i16 coeff1_m = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, \ - cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; \ - v8i16 coeff2_m = { \ - -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 \ - }; \ - \ - /* stp 1 */ \ - ILVL_H2_SH(input2, input5, input3, input4, vec2_m, vec4_m); \ - ILVR_H2_SH(input2, input5, input3, input4, vec3_m, vec5_m); \ - \ - cnst4_m = __msa_splati_h(coeff_m, 0); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m); \ - \ - cnst5_m = __msa_splati_h(coeff_m, 1); \ - cnst5_m = __msa_ilvev_h(cnst5_m, cnst4_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m); \ - stp24_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m); \ - stp23_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m); \ - \ - /* stp2 */ \ - BUTTERFLY_4(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m, stp32_m, \ - stp33_m); \ - BUTTERFLY_4(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m, stp35_m, \ - stp34_m); \ - \ - ILVL_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec2_m, vec4_m); \ - ILVR_H2_SH(stp36_m, stp31_m, stp35_m, stp32_m, vec3_m, vec5_m); \ - \ - SPLATI_H2_SH(coeff_m, 2, 3, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp26_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 4); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp21_m = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - SPLATI_H2_SH(coeff_m, 5, 2, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - stp25_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - cnst0_m = __msa_splati_h(coeff_m, 3); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - stp22_m = DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m); \ - \ - /* stp4 */ \ - BUTTERFLY_4(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m, vec4_m, \ - vec5_m); \ - BUTTERFLY_4(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m, stp24_m, \ - stp31_m); \ - \ - ILVRL_H2_SH(vec2_m, vec6_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 0, 1, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - \ - out1 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 0); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out15 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(vec4_m, vec5_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out9 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 2); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out7 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp23_m, stp21_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 4, 5, cnst0_m, cnst1_m); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - cnst0_m = __msa_splati_h(coeff2_m, 1); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - out11 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - \ - ILVRL_H2_SH(stp24_m, stp31_m, vec1_m, vec0_m); \ - SPLATI_H2_SH(coeff1_m, 6, 7, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - out13 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - \ - cnst1_m = __msa_splati_h(coeff2_m, 3); \ - cnst0_m = __msa_ilvev_h(cnst0_m, cnst1_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - } - -#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \ - { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one_m = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clti_s_h(vec0, 0); \ - tp1_m = __msa_clti_s_h(vec1, 0); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one_m & tp0_m; \ - tp1_m = one_m & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ - } - -#define FDCT32_POSTPROC_NEG_W(vec) \ - { \ - v4i32 temp_m; \ - v4i32 one_m = __msa_ldi_w(1); \ - \ - temp_m = __msa_clti_s_w(vec, 0); \ - vec += 1; \ - temp_m = one_m & temp_m; \ - vec += temp_m; \ - vec >>= 2; \ - } - -#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \ - { \ - v8i16 tp0_m, tp1_m; \ - v8i16 one = __msa_ldi_h(1); \ - \ - tp0_m = __msa_clei_s_h(vec0, 0); \ - tp1_m = __msa_clei_s_h(vec1, 0); \ - tp0_m = (v8i16)__msa_xori_b((v16u8)tp0_m, 255); \ - tp1_m = (v8i16)__msa_xori_b((v16u8)tp1_m, 255); \ - vec0 += 1; \ - vec1 += 1; \ - tp0_m = one & tp0_m; \ - tp1_m = one & tp1_m; \ - vec0 += tp0_m; \ - vec1 += tp1_m; \ - vec0 >>= 2; \ - vec1 >>= 2; \ - } - -#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right, \ - const0, const1, out0, out1, out2, out3) \ - { \ - v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m; \ - v2i64 tp0_m, tp1_m, tp2_m, tp3_m; \ - v4i32 k0_m = __msa_fill_w((int32_t)const0); \ - \ - s0_m = __msa_fill_w((int32_t)const1); \ - k0_m = __msa_ilvev_w(s0_m, k0_m); \ - \ - ILVRL_W2_SW(-reg1_left, reg0_left, s1_m, s0_m); \ - ILVRL_W2_SW(reg0_left, reg1_left, s3_m, s2_m); \ - ILVRL_W2_SW(-reg1_right, reg0_right, s5_m, s4_m); \ - ILVRL_W2_SW(reg0_right, reg1_right, s7_m, s6_m); \ - \ - DOTP_SW2_SD(s0_m, s1_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s4_m, s5_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out1 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ - \ - DOTP_SW2_SD(s2_m, s3_m, k0_m, k0_m, tp0_m, tp1_m); \ - DOTP_SW2_SD(s6_m, s7_m, k0_m, k0_m, tp2_m, tp3_m); \ - tp0_m = __msa_srari_d(tp0_m, DCT_CONST_BITS); \ - tp1_m = __msa_srari_d(tp1_m, DCT_CONST_BITS); \ - tp2_m = __msa_srari_d(tp2_m, DCT_CONST_BITS); \ - tp3_m = __msa_srari_d(tp3_m, DCT_CONST_BITS); \ - out2 = __msa_pckev_w((v4i32)tp0_m, (v4i32)tp1_m); \ - out3 = __msa_pckev_w((v4i32)tp2_m, (v4i32)tp3_m); \ - } - -void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, - int32_t src_stride); -void fdct16x8_1d_row(int16_t *input, int16_t *output); -#endif // AOM_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/idct16x16_msa.c b/third_party/aom/aom_dsp/mips/idct16x16_msa.c deleted file mode 100644 index 0ea127f52..000000000 --- a/third_party/aom/aom_dsp/mips/idct16x16_msa.c +++ /dev/null @@ -1,486 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { - v8i16 loc0, loc1, loc2, loc3; - v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; - v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; - v8i16 tmp5, tmp6, tmp7; - - LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - input += 8; - LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - - TRANSPOSE8x8_SH_SH(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg0, reg1, - reg2, reg3, reg4, reg5, reg6, reg7); - TRANSPOSE8x8_SH_SH(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15, reg8, - reg9, reg10, reg11, reg12, reg13, reg14, reg15); - DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); - DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); - BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); - DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); - DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); - DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); - BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); - SUB4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg0, reg12, reg4, - reg8); - ADD4(reg2, loc1, reg14, loc0, reg6, loc3, reg10, loc2, reg2, reg14, reg6, - reg10); - - /* stage 2 */ - DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); - DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); - - reg9 = reg1 - loc2; - reg1 = reg1 + loc2; - reg7 = reg15 - loc3; - reg15 = reg15 + loc3; - - DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); - DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); - BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); - - loc1 = reg15 + reg3; - reg3 = reg15 - reg3; - loc2 = reg2 + loc1; - reg15 = reg2 - loc1; - - loc1 = reg1 + reg13; - reg13 = reg1 - reg13; - loc0 = reg0 + loc1; - loc1 = reg0 - loc1; - tmp6 = loc0; - tmp7 = loc1; - reg0 = loc2; - - DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); - DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); - - loc0 = reg9 + reg5; - reg5 = reg9 - reg5; - reg2 = reg6 + loc0; - reg1 = reg6 - loc0; - - loc0 = reg7 + reg11; - reg11 = reg7 - reg11; - loc1 = reg4 + loc0; - loc2 = reg4 - loc0; - tmp5 = loc1; - - DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); - BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); - - reg10 = loc0; - reg11 = loc1; - - DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); - BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); - - reg13 = loc2; - - /* Transpose and store the output */ - reg12 = tmp5; - reg14 = tmp6; - reg3 = tmp7; - - /* transpose block */ - TRANSPOSE8x8_SH_SH(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, reg0, - reg2, reg4, reg6, reg8, reg10, reg12, reg14); - ST_SH8(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14, output, 16); - - /* transpose block */ - TRANSPOSE8x8_SH_SH(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, reg3, - reg13, reg11, reg5, reg7, reg9, reg1, reg15); - ST_SH8(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15, (output + 8), 16); -} - -void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 loc0, loc1, loc2, loc3; - v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14; - v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15; - v8i16 tmp5, tmp6, tmp7; - - /* load up 8x8 */ - LD_SH8(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - input += 8 * 16; - /* load bottom 8x8 */ - LD_SH8(input, 16, reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15); - - DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14); - DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6); - BUTTERFLY_4(reg2, reg14, reg6, reg10, loc0, loc1, reg14, reg2); - DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3); - DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8); - DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12); - BUTTERFLY_4(reg8, reg0, reg4, reg12, reg2, reg6, reg10, reg14); - - reg0 = reg2 - loc1; - reg2 = reg2 + loc1; - reg12 = reg14 - loc0; - reg14 = reg14 + loc0; - reg4 = reg6 - loc3; - reg6 = reg6 + loc3; - reg8 = reg10 - loc2; - reg10 = reg10 + loc2; - - /* stage 2 */ - DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15); - DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3); - - reg9 = reg1 - loc2; - reg1 = reg1 + loc2; - reg7 = reg15 - loc3; - reg15 = reg15 + loc3; - - DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11); - DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1); - BUTTERFLY_4(loc0, loc1, reg11, reg5, reg13, reg3, reg11, reg5); - - loc1 = reg15 + reg3; - reg3 = reg15 - reg3; - loc2 = reg2 + loc1; - reg15 = reg2 - loc1; - - loc1 = reg1 + reg13; - reg13 = reg1 - reg13; - loc0 = reg0 + loc1; - loc1 = reg0 - loc1; - tmp6 = loc0; - tmp7 = loc1; - reg0 = loc2; - - DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9); - DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11); - - loc0 = reg9 + reg5; - reg5 = reg9 - reg5; - reg2 = reg6 + loc0; - reg1 = reg6 - loc0; - - loc0 = reg7 + reg11; - reg11 = reg7 - reg11; - loc1 = reg4 + loc0; - loc2 = reg4 - loc0; - tmp5 = loc1; - - DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11); - BUTTERFLY_4(reg8, reg10, reg11, reg5, loc0, reg4, reg9, loc1); - - reg10 = loc0; - reg11 = loc1; - - DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13); - BUTTERFLY_4(reg12, reg14, reg13, reg3, reg8, reg6, reg7, reg5); - reg13 = loc2; - - /* Transpose and store the output */ - reg12 = tmp5; - reg14 = tmp6; - reg3 = tmp7; - - SRARI_H4_SH(reg0, reg2, reg4, reg6, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg0, reg2, reg4, reg6); - dst += (4 * dst_stride); - SRARI_H4_SH(reg8, reg10, reg12, reg14, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg8, reg10, reg12, reg14); - dst += (4 * dst_stride); - SRARI_H4_SH(reg3, reg13, reg11, reg5, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg3, reg13, reg11, reg5); - dst += (4 * dst_stride); - SRARI_H4_SH(reg7, reg9, reg1, reg15, 6); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, reg7, reg9, reg1, reg15); -} - -void aom_idct16x16_256_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); - int16_t *out = out_arr; - - /* transform rows */ - for (i = 0; i < 2; ++i) { - /* process 16 * 8 block */ - aom_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7))); - } - - /* transform columns */ - for (i = 0; i < 2; ++i) { - /* process 8 * 16 block */ - aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct16x16_10_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - uint8_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]); - int16_t *out = out_arr; - - /* process 16 * 8 block */ - aom_idct16_1d_rows_msa(input, out); - - /* short case just considers top 4 rows as valid output */ - out += 4 * 16; - for (i = 12; i--;) { - __asm__ __volatile__( - "sw $zero, 0(%[out]) \n\t" - "sw $zero, 4(%[out]) \n\t" - "sw $zero, 8(%[out]) \n\t" - "sw $zero, 12(%[out]) \n\t" - "sw $zero, 16(%[out]) \n\t" - "sw $zero, 20(%[out]) \n\t" - "sw $zero, 24(%[out]) \n\t" - "sw $zero, 28(%[out]) \n\t" - - : - : [out] "r"(out)); - - out += 16; - } - - out = out_arr; - - /* transform columns */ - for (i = 0; i < 2; ++i) { - /* process 8 * 16 block */ - aom_idct16_1d_columns_addblk_msa((out + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct16x16_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - uint8_t i; - int16_t out; - v8i16 vec, res0, res1, res2, res3, res4, res5, res6, res7; - v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; - - out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO(out, 6); - - vec = __msa_fill_h(out); - - for (i = 4; i--;) { - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - UNPCK_UB_SH(dst0, res0, res4); - UNPCK_UB_SH(dst1, res1, res5); - UNPCK_UB_SH(dst2, res2, res6); - UNPCK_UB_SH(dst3, res3, res7); - ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); - ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); - CLIP_SH4_0_255(res0, res1, res2, res3); - CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, - tmp2, tmp3); - ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) { - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15; - - /* load input data */ - LD_SH16(input, 8, l0, l8, l1, l9, l2, l10, l3, l11, l4, l12, l5, l13, l6, l14, - l7, l15); - TRANSPOSE8x8_SH_SH(l0, l1, l2, l3, l4, l5, l6, l7, l0, l1, l2, l3, l4, l5, l6, - l7); - TRANSPOSE8x8_SH_SH(l8, l9, l10, l11, l12, l13, l14, l15, l8, l9, l10, l11, - l12, l13, l14, l15); - - /* ADST in horizontal */ - AOM_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, - l14, l15, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, - r12, r13, r14, r15); - - l1 = -r8; - l3 = -r4; - l13 = -r13; - l15 = -r1; - - TRANSPOSE8x8_SH_SH(r0, l1, r12, l3, r6, r14, r10, r2, l0, l1, l2, l3, l4, l5, - l6, l7); - ST_SH8(l0, l1, l2, l3, l4, l5, l6, l7, output, 16); - TRANSPOSE8x8_SH_SH(r3, r11, r15, r7, r5, l13, r9, l15, l8, l9, l10, l11, l12, - l13, l14, l15); - ST_SH8(l8, l9, l10, l11, l12, l13, l14, l15, (output + 8), 16); -} - -void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 v0, v2, v4, v6, k0, k1, k2, k3; - v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; - v8i16 out0, out1, out2, out3, out4, out5, out6, out7; - v8i16 out8, out9, out10, out11, out12, out13, out14, out15; - v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15; - v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11; - v8i16 res0, res1, res2, res3, res4, res5, res6, res7; - v8i16 res8, res9, res10, res11, res12, res13, res14, res15; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; - v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15; - v16i8 zero = { 0 }; - - r0 = LD_SH(input + 0 * 16); - r3 = LD_SH(input + 3 * 16); - r4 = LD_SH(input + 4 * 16); - r7 = LD_SH(input + 7 * 16); - r8 = LD_SH(input + 8 * 16); - r11 = LD_SH(input + 11 * 16); - r12 = LD_SH(input + 12 * 16); - r15 = LD_SH(input + 15 * 16); - - /* stage 1 */ - k0 = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); - k1 = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); - k2 = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); - k3 = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); - MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3); - k0 = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); - k1 = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); - k2 = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); - k3 = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); - MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11); - BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0); - k0 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); - k1 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); - k2 = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); - MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3); - - r1 = LD_SH(input + 1 * 16); - r2 = LD_SH(input + 2 * 16); - r5 = LD_SH(input + 5 * 16); - r6 = LD_SH(input + 6 * 16); - r9 = LD_SH(input + 9 * 16); - r10 = LD_SH(input + 10 * 16); - r13 = LD_SH(input + 13 * 16); - r14 = LD_SH(input + 14 * 16); - - k0 = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); - k1 = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); - k2 = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); - k3 = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); - MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7); - k0 = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); - k1 = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); - k2 = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); - k3 = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); - MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15); - BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4); - BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10); - out1 = -out1; - SRARI_H2_SH(out0, out1, 6); - dst0 = LD_UB(dst + 0 * dst_stride); - dst1 = LD_UB(dst + 15 * dst_stride); - ILVR_B2_SH(zero, dst0, zero, dst1, res0, res1); - ADD2(res0, out0, res1, out1, res0, res1); - CLIP_SH2_0_255(res0, res1); - PCKEV_B2_SH(res0, res0, res1, res1, res0, res1); - ST8x1_UB(res0, dst); - ST8x1_UB(res1, dst + 15 * dst_stride); - - k0 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); - k1 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); - k2 = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); - MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7); - BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10); - out8 = -out8; - - SRARI_H2_SH(out8, out9, 6); - dst8 = LD_UB(dst + 1 * dst_stride); - dst9 = LD_UB(dst + 14 * dst_stride); - ILVR_B2_SH(zero, dst8, zero, dst9, res8, res9); - ADD2(res8, out8, res9, out9, res8, res9); - CLIP_SH2_0_255(res8, res9); - PCKEV_B2_SH(res8, res8, res9, res9, res8, res9); - ST8x1_UB(res8, dst + dst_stride); - ST8x1_UB(res9, dst + 14 * dst_stride); - - k0 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); - k1 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); - k2 = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); - MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7); - out4 = -out4; - SRARI_H2_SH(out4, out5, 6); - dst4 = LD_UB(dst + 3 * dst_stride); - dst5 = LD_UB(dst + 12 * dst_stride); - ILVR_B2_SH(zero, dst4, zero, dst5, res4, res5); - ADD2(res4, out4, res5, out5, res4, res5); - CLIP_SH2_0_255(res4, res5); - PCKEV_B2_SH(res4, res4, res5, res5, res4, res5); - ST8x1_UB(res4, dst + 3 * dst_stride); - ST8x1_UB(res5, dst + 12 * dst_stride); - - MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15); - out13 = -out13; - SRARI_H2_SH(out12, out13, 6); - dst12 = LD_UB(dst + 2 * dst_stride); - dst13 = LD_UB(dst + 13 * dst_stride); - ILVR_B2_SH(zero, dst12, zero, dst13, res12, res13); - ADD2(res12, out12, res13, out13, res12, res13); - CLIP_SH2_0_255(res12, res13); - PCKEV_B2_SH(res12, res12, res13, res13, res12, res13); - ST8x1_UB(res12, dst + 2 * dst_stride); - ST8x1_UB(res13, dst + 13 * dst_stride); - - k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); - k3 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); - MADD_SHORT(out6, out7, k0, k3, out6, out7); - SRARI_H2_SH(out6, out7, 6); - dst6 = LD_UB(dst + 4 * dst_stride); - dst7 = LD_UB(dst + 11 * dst_stride); - ILVR_B2_SH(zero, dst6, zero, dst7, res6, res7); - ADD2(res6, out6, res7, out7, res6, res7); - CLIP_SH2_0_255(res6, res7); - PCKEV_B2_SH(res6, res6, res7, res7, res6, res7); - ST8x1_UB(res6, dst + 4 * dst_stride); - ST8x1_UB(res7, dst + 11 * dst_stride); - - MADD_SHORT(out10, out11, k0, k3, out10, out11); - SRARI_H2_SH(out10, out11, 6); - dst10 = LD_UB(dst + 6 * dst_stride); - dst11 = LD_UB(dst + 9 * dst_stride); - ILVR_B2_SH(zero, dst10, zero, dst11, res10, res11); - ADD2(res10, out10, res11, out11, res10, res11); - CLIP_SH2_0_255(res10, res11); - PCKEV_B2_SH(res10, res10, res11, res11, res10, res11); - ST8x1_UB(res10, dst + 6 * dst_stride); - ST8x1_UB(res11, dst + 9 * dst_stride); - - k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); - k2 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); - MADD_SHORT(h10, h11, k1, k2, out2, out3); - SRARI_H2_SH(out2, out3, 6); - dst2 = LD_UB(dst + 7 * dst_stride); - dst3 = LD_UB(dst + 8 * dst_stride); - ILVR_B2_SH(zero, dst2, zero, dst3, res2, res3); - ADD2(res2, out2, res3, out3, res2, res3); - CLIP_SH2_0_255(res2, res3); - PCKEV_B2_SH(res2, res2, res3, res3, res2, res3); - ST8x1_UB(res2, dst + 7 * dst_stride); - ST8x1_UB(res3, dst + 8 * dst_stride); - - MADD_SHORT(out14, out15, k1, k2, out14, out15); - SRARI_H2_SH(out14, out15, 6); - dst14 = LD_UB(dst + 5 * dst_stride); - dst15 = LD_UB(dst + 10 * dst_stride); - ILVR_B2_SH(zero, dst14, zero, dst15, res14, res15); - ADD2(res14, out14, res15, out15, res14, res15); - CLIP_SH2_0_255(res14, res15); - PCKEV_B2_SH(res14, res14, res15, res15, res14, res15); - ST8x1_UB(res14, dst + 5 * dst_stride); - ST8x1_UB(res15, dst + 10 * dst_stride); -} diff --git a/third_party/aom/aom_dsp/mips/idct32x32_msa.c b/third_party/aom/aom_dsp/mips/idct32x32_msa.c deleted file mode 100644 index f1ca757a0..000000000 --- a/third_party/aom/aom_dsp/mips/idct32x32_msa.c +++ /dev/null @@ -1,730 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -static void idct32x8_row_transpose_store(const int16_t *input, - int16_t *tmp_buf) { - v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; - - /* 1st & 2nd 8x8 */ - LD_SH8(input, 32, m0, n0, m1, n1, m2, n2, m3, n3); - LD_SH8((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH8(m0, n0, m1, n1, m2, n2, m3, n3, (tmp_buf), 8); - ST_SH4(m4, n4, m5, n5, (tmp_buf + 8 * 8), 8); - ST_SH4(m6, n6, m7, n7, (tmp_buf + 12 * 8), 8); - - /* 3rd & 4th 8x8 */ - LD_SH8((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3); - LD_SH8((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH4(m0, n0, m1, n1, (tmp_buf + 16 * 8), 8); - ST_SH4(m2, n2, m3, n3, (tmp_buf + 20 * 8), 8); - ST_SH4(m4, n4, m5, n5, (tmp_buf + 24 * 8), 8); - ST_SH4(m6, n6, m7, n7, (tmp_buf + 28 * 8), 8); -} - -static void idct32x8_row_even_process_store(int16_t *tmp_buf, - int16_t *tmp_eve_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; - - /* Even stage 1 */ - LD_SH8(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - - DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); - DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); - BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); - DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - loc1 = vec3; - loc0 = vec1; - - DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); - DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); - BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); - BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); - BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); - - /* Even stage 2 */ - LD_SH8((tmp_buf + 16), 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); - DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); - - vec0 = reg0 + reg4; - reg0 = reg0 - reg4; - reg4 = reg6 + reg2; - reg6 = reg6 - reg2; - reg2 = reg1 + reg5; - reg1 = reg1 - reg5; - reg5 = reg7 + reg3; - reg7 = reg7 - reg3; - reg3 = vec0; - - vec1 = reg2; - reg2 = reg3 + reg4; - reg3 = reg3 - reg4; - reg4 = reg5 - vec1; - reg5 = reg5 + vec1; - - DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); - DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); - - vec0 = reg0 - reg6; - reg0 = reg0 + reg6; - vec1 = reg7 - reg1; - reg7 = reg7 + reg1; - - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); - DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); - - /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ - BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); - ST_SH(loc0, (tmp_eve_buf + 15 * 8)); - ST_SH(loc1, (tmp_eve_buf)); - ST_SH(loc2, (tmp_eve_buf + 14 * 8)); - ST_SH(loc3, (tmp_eve_buf + 8)); - - BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); - ST_SH(loc0, (tmp_eve_buf + 13 * 8)); - ST_SH(loc1, (tmp_eve_buf + 2 * 8)); - ST_SH(loc2, (tmp_eve_buf + 12 * 8)); - ST_SH(loc3, (tmp_eve_buf + 3 * 8)); - - /* Store 8 */ - BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); - ST_SH(loc0, (tmp_eve_buf + 11 * 8)); - ST_SH(loc1, (tmp_eve_buf + 4 * 8)); - ST_SH(loc2, (tmp_eve_buf + 10 * 8)); - ST_SH(loc3, (tmp_eve_buf + 5 * 8)); - - BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); - ST_SH(loc0, (tmp_eve_buf + 9 * 8)); - ST_SH(loc1, (tmp_eve_buf + 6 * 8)); - ST_SH(loc2, (tmp_eve_buf + 8 * 8)); - ST_SH(loc3, (tmp_eve_buf + 7 * 8)); -} - -static void idct32x8_row_odd_process_store(int16_t *tmp_buf, - int16_t *tmp_odd_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - - /* Odd stage 1 */ - reg0 = LD_SH(tmp_buf + 8); - reg1 = LD_SH(tmp_buf + 7 * 8); - reg2 = LD_SH(tmp_buf + 9 * 8); - reg3 = LD_SH(tmp_buf + 15 * 8); - reg4 = LD_SH(tmp_buf + 17 * 8); - reg5 = LD_SH(tmp_buf + 23 * 8); - reg6 = LD_SH(tmp_buf + 25 * 8); - reg7 = LD_SH(tmp_buf + 31 * 8); - - DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); - DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); - - vec0 = reg0 + reg3; - reg0 = reg0 - reg3; - reg3 = reg7 + reg4; - reg7 = reg7 - reg4; - reg4 = reg1 + reg2; - reg1 = reg1 - reg2; - reg2 = reg6 + reg5; - reg6 = reg6 - reg5; - reg5 = vec0; - - /* 4 Stores */ - ADD2(reg5, reg4, reg3, reg2, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); - - SUB2(reg5, reg4, reg3, reg2, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf), 8); - - /* 4 Stores */ - DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); - DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); - BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); - ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); - - DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); - ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); - - /* Odd stage 2 */ - /* 8 loads */ - reg0 = LD_SH(tmp_buf + 3 * 8); - reg1 = LD_SH(tmp_buf + 5 * 8); - reg2 = LD_SH(tmp_buf + 11 * 8); - reg3 = LD_SH(tmp_buf + 13 * 8); - reg4 = LD_SH(tmp_buf + 19 * 8); - reg5 = LD_SH(tmp_buf + 21 * 8); - reg6 = LD_SH(tmp_buf + 27 * 8); - reg7 = LD_SH(tmp_buf + 29 * 8); - - DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); - DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); - DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); - DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); - - /* 4 Stores */ - SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); - DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); - DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); - - BUTTERFLY_4(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3); - ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); - - DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); - - /* 4 Stores */ - ADD4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1, vec2, vec0, vec3); - BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); - ST_SH(reg0, (tmp_odd_buf + 13 * 8)); - ST_SH(reg1, (tmp_odd_buf + 14 * 8)); - - DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); - ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); - - /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ - - /* Load 8 & Store 8 */ - LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); - LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); - - SUB2(reg0, reg4, reg1, reg5, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg2, reg6, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); - - /* Load 8 & Store 8 */ - LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); - LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); - - SUB2(reg0, reg4, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg1, reg5, reg2, reg6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); -} - -static void idct_butterfly_transpose_store(int16_t *tmp_buf, - int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, int16_t *dst) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; - - /* FINAL BUTTERFLY : Dependency on Even & Odd */ - vec0 = LD_SH(tmp_odd_buf); - vec1 = LD_SH(tmp_odd_buf + 9 * 8); - vec2 = LD_SH(tmp_odd_buf + 14 * 8); - vec3 = LD_SH(tmp_odd_buf + 6 * 8); - loc0 = LD_SH(tmp_eve_buf); - loc1 = LD_SH(tmp_eve_buf + 8 * 8); - loc2 = LD_SH(tmp_eve_buf + 4 * 8); - loc3 = LD_SH(tmp_eve_buf + 12 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); - - ST_SH((loc0 - vec3), (tmp_buf + 31 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 23 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 27 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 19 * 8)); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 4 * 8); - vec1 = LD_SH(tmp_odd_buf + 13 * 8); - vec2 = LD_SH(tmp_odd_buf + 10 * 8); - vec3 = LD_SH(tmp_odd_buf + 3 * 8); - loc0 = LD_SH(tmp_eve_buf + 2 * 8); - loc1 = LD_SH(tmp_eve_buf + 10 * 8); - loc2 = LD_SH(tmp_eve_buf + 6 * 8); - loc3 = LD_SH(tmp_eve_buf + 14 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); - - ST_SH((loc0 - vec3), (tmp_buf + 29 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 21 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 25 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 17 * 8)); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 2 * 8); - vec1 = LD_SH(tmp_odd_buf + 11 * 8); - vec2 = LD_SH(tmp_odd_buf + 12 * 8); - vec3 = LD_SH(tmp_odd_buf + 7 * 8); - loc0 = LD_SH(tmp_eve_buf + 1 * 8); - loc1 = LD_SH(tmp_eve_buf + 9 * 8); - loc2 = LD_SH(tmp_eve_buf + 5 * 8); - loc3 = LD_SH(tmp_eve_buf + 13 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); - - ST_SH((loc0 - vec3), (tmp_buf + 30 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 22 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 26 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 18 * 8)); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 5 * 8); - vec1 = LD_SH(tmp_odd_buf + 15 * 8); - vec2 = LD_SH(tmp_odd_buf + 8 * 8); - vec3 = LD_SH(tmp_odd_buf + 1 * 8); - loc0 = LD_SH(tmp_eve_buf + 3 * 8); - loc1 = LD_SH(tmp_eve_buf + 11 * 8); - loc2 = LD_SH(tmp_eve_buf + 7 * 8); - loc3 = LD_SH(tmp_eve_buf + 15 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); - - ST_SH((loc0 - vec3), (tmp_buf + 28 * 8)); - ST_SH((loc1 - vec2), (tmp_buf + 20 * 8)); - ST_SH((loc2 - vec1), (tmp_buf + 24 * 8)); - ST_SH((loc3 - vec0), (tmp_buf + 16 * 8)); - - /* Transpose : 16 vectors */ - /* 1st & 2nd 8x8 */ - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - ST_SH4(m0, n0, m1, n1, (dst + 0), 32); - ST_SH4(m2, n2, m3, n3, (dst + 4 * 32), 32); - - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH4(m4, n4, m5, n5, (dst + 8), 32); - ST_SH4(m6, n6, m7, n7, (dst + 8 + 4 * 32), 32); - - /* 3rd & 4th 8x8 */ - LD_SH8((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3); - LD_SH8((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7); - TRANSPOSE8x8_SH_SH(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3, - n3); - ST_SH4(m0, n0, m1, n1, (dst + 16), 32); - ST_SH4(m2, n2, m3, n3, (dst + 16 + 4 * 32), 32); - - TRANSPOSE8x8_SH_SH(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7, - n7); - ST_SH4(m4, n4, m5, n5, (dst + 24), 32); - ST_SH4(m6, n6, m7, n7, (dst + 24 + 4 * 32), 32); -} - -static void idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) { - DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]); - DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); - DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); - - idct32x8_row_transpose_store(input, &tmp_buf[0]); - idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]); - idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]); - idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0], - output); -} - -static void idct8x32_column_even_process_store(int16_t *tmp_buf, - int16_t *tmp_eve_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7; - - /* Even stage 1 */ - LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - tmp_buf += (2 * 32); - - DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7); - DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3); - BUTTERFLY_4(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0); - DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - - loc1 = vec3; - loc0 = vec1; - - DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4); - DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6); - BUTTERFLY_4(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0); - BUTTERFLY_4(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4); - BUTTERFLY_4(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5); - - /* Even stage 2 */ - /* Load 8 */ - LD_SH8(tmp_buf, (4 * 32), reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7); - - DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3); - DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1); - - vec0 = reg0 + reg4; - reg0 = reg0 - reg4; - reg4 = reg6 + reg2; - reg6 = reg6 - reg2; - reg2 = reg1 + reg5; - reg1 = reg1 - reg5; - reg5 = reg7 + reg3; - reg7 = reg7 - reg3; - reg3 = vec0; - - vec1 = reg2; - reg2 = reg3 + reg4; - reg3 = reg3 - reg4; - reg4 = reg5 - vec1; - reg5 = reg5 + vec1; - - DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7); - DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1); - - vec0 = reg0 - reg6; - reg0 = reg0 + reg6; - vec1 = reg7 - reg1; - reg7 = reg7 + reg1; - - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1); - DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4); - - /* Even stage 3 : Dependency on Even stage 1 & Even stage 2 */ - /* Store 8 */ - BUTTERFLY_4(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, tmp_eve_buf, 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 14 * 8), 8); - - BUTTERFLY_4(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, (tmp_eve_buf + 2 * 8), 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 12 * 8), 8); - - /* Store 8 */ - BUTTERFLY_4(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, (tmp_eve_buf + 4 * 8), 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 10 * 8), 8); - - BUTTERFLY_4(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0); - ST_SH2(loc1, loc3, (tmp_eve_buf + 6 * 8), 8); - ST_SH2(loc2, loc0, (tmp_eve_buf + 8 * 8), 8); -} - -static void idct8x32_column_odd_process_store(int16_t *tmp_buf, - int16_t *tmp_odd_buf) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; - - /* Odd stage 1 */ - reg0 = LD_SH(tmp_buf + 32); - reg1 = LD_SH(tmp_buf + 7 * 32); - reg2 = LD_SH(tmp_buf + 9 * 32); - reg3 = LD_SH(tmp_buf + 15 * 32); - reg4 = LD_SH(tmp_buf + 17 * 32); - reg5 = LD_SH(tmp_buf + 23 * 32); - reg6 = LD_SH(tmp_buf + 25 * 32); - reg7 = LD_SH(tmp_buf + 31 * 32); - - DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7); - DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4); - DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5); - DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6); - - vec0 = reg0 + reg3; - reg0 = reg0 - reg3; - reg3 = reg7 + reg4; - reg7 = reg7 - reg4; - reg4 = reg1 + reg2; - reg1 = reg1 - reg2; - reg2 = reg6 + reg5; - reg6 = reg6 - reg5; - reg5 = vec0; - - /* 4 Stores */ - ADD2(reg5, reg4, reg3, reg2, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 4 * 8), 8); - SUB2(reg5, reg4, reg3, reg2, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1); - ST_SH2(vec0, vec1, tmp_odd_buf, 8); - - /* 4 Stores */ - DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7); - DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6); - BUTTERFLY_4(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3); - ST_SH2(vec0, vec1, (tmp_odd_buf + 6 * 8), 8); - DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3); - ST_SH2(vec2, vec3, (tmp_odd_buf + 2 * 8), 8); - - /* Odd stage 2 */ - /* 8 loads */ - reg0 = LD_SH(tmp_buf + 3 * 32); - reg1 = LD_SH(tmp_buf + 5 * 32); - reg2 = LD_SH(tmp_buf + 11 * 32); - reg3 = LD_SH(tmp_buf + 13 * 32); - reg4 = LD_SH(tmp_buf + 19 * 32); - reg5 = LD_SH(tmp_buf + 21 * 32); - reg6 = LD_SH(tmp_buf + 27 * 32); - reg7 = LD_SH(tmp_buf + 29 * 32); - - DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6); - DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5); - DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4); - DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7); - - /* 4 Stores */ - SUB4(reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0, vec1, vec2, vec3); - DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1); - DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3); - BUTTERFLY_4(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2); - ST_SH2(vec0, vec1, (tmp_odd_buf + 12 * 8), 3 * 8); - DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1); - ST_SH2(vec0, vec1, (tmp_odd_buf + 10 * 8), 8); - - /* 4 Stores */ - ADD4(reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0, vec1, vec2, vec3); - BUTTERFLY_4(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2); - ST_SH2(reg0, reg1, (tmp_odd_buf + 13 * 8), 8); - DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1); - ST_SH2(reg0, reg1, (tmp_odd_buf + 8 * 8), 8); - - /* Odd stage 3 : Dependency on Odd stage 1 & Odd stage 2 */ - /* Load 8 & Store 8 */ - LD_SH4(tmp_odd_buf, 8, reg0, reg1, reg2, reg3); - LD_SH4((tmp_odd_buf + 8 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, tmp_odd_buf, 8); - - SUB2(reg0, reg4, reg1, reg5, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg2, reg6, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 8 * 8), 8); - - /* Load 8 & Store 8 */ - LD_SH4((tmp_odd_buf + 4 * 8), 8, reg1, reg2, reg0, reg3); - LD_SH4((tmp_odd_buf + 12 * 8), 8, reg4, reg5, reg6, reg7); - - ADD4(reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0, loc1, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 4 * 8), 8); - - SUB2(reg0, reg4, reg3, reg7, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1); - - SUB2(reg1, reg5, reg2, reg6, vec0, vec1); - DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3); - ST_SH4(loc0, loc1, loc2, loc3, (tmp_odd_buf + 12 * 8), 8); -} - -static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf, - int16_t *tmp_odd_buf, uint8_t *dst, - int32_t dst_stride) { - v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3; - v8i16 m0, m1, m2, m3, m4, m5, m6, m7, n0, n1, n2, n3, n4, n5, n6, n7; - - /* FINAL BUTTERFLY : Dependency on Even & Odd */ - vec0 = LD_SH(tmp_odd_buf); - vec1 = LD_SH(tmp_odd_buf + 9 * 8); - vec2 = LD_SH(tmp_odd_buf + 14 * 8); - vec3 = LD_SH(tmp_odd_buf + 6 * 8); - loc0 = LD_SH(tmp_eve_buf); - loc1 = LD_SH(tmp_eve_buf + 8 * 8); - loc2 = LD_SH(tmp_eve_buf + 4 * 8); - loc3 = LD_SH(tmp_eve_buf + 12 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0, m4, m2, m6); - SRARI_H4_SH(m0, m2, m4, m6, 6); - AOM_ADDBLK_ST8x4_UB(dst, (4 * dst_stride), m0, m2, m4, m6); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6, m2, m4, m0); - SRARI_H4_SH(m0, m2, m4, m6, 6); - AOM_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), (4 * dst_stride), m0, m2, m4, - m6); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 4 * 8); - vec1 = LD_SH(tmp_odd_buf + 13 * 8); - vec2 = LD_SH(tmp_odd_buf + 10 * 8); - vec3 = LD_SH(tmp_odd_buf + 3 * 8); - loc0 = LD_SH(tmp_eve_buf + 2 * 8); - loc1 = LD_SH(tmp_eve_buf + 10 * 8); - loc2 = LD_SH(tmp_eve_buf + 6 * 8); - loc3 = LD_SH(tmp_eve_buf + 14 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1, m5, m3, m7); - SRARI_H4_SH(m1, m3, m5, m7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), (4 * dst_stride), m1, m3, m5, m7); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7, m3, m5, m1); - SRARI_H4_SH(m1, m3, m5, m7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), (4 * dst_stride), m1, m3, m5, - m7); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 2 * 8); - vec1 = LD_SH(tmp_odd_buf + 11 * 8); - vec2 = LD_SH(tmp_odd_buf + 12 * 8); - vec3 = LD_SH(tmp_odd_buf + 7 * 8); - loc0 = LD_SH(tmp_eve_buf + 1 * 8); - loc1 = LD_SH(tmp_eve_buf + 9 * 8); - loc2 = LD_SH(tmp_eve_buf + 5 * 8); - loc3 = LD_SH(tmp_eve_buf + 13 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0, n4, n2, n6); - SRARI_H4_SH(n0, n2, n4, n6, 6); - AOM_ADDBLK_ST8x4_UB((dst + 1 * dst_stride), (4 * dst_stride), n0, n2, n4, n6); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6, n2, n4, n0); - SRARI_H4_SH(n0, n2, n4, n6, 6); - AOM_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), (4 * dst_stride), n0, n2, n4, - n6); - - /* Load 8 & Store 8 */ - vec0 = LD_SH(tmp_odd_buf + 5 * 8); - vec1 = LD_SH(tmp_odd_buf + 15 * 8); - vec2 = LD_SH(tmp_odd_buf + 8 * 8); - vec3 = LD_SH(tmp_odd_buf + 1 * 8); - loc0 = LD_SH(tmp_eve_buf + 3 * 8); - loc1 = LD_SH(tmp_eve_buf + 11 * 8); - loc2 = LD_SH(tmp_eve_buf + 7 * 8); - loc3 = LD_SH(tmp_eve_buf + 15 * 8); - - ADD4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1, n5, n3, n7); - SRARI_H4_SH(n1, n3, n5, n7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), (4 * dst_stride), n1, n3, n5, n7); - - SUB4(loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7, n3, n5, n1); - SRARI_H4_SH(n1, n3, n5, n7, 6); - AOM_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), (4 * dst_stride), n1, n3, n5, - n7); -} - -static void idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride) { - DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]); - DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]); - - idct8x32_column_even_process_store(input, &tmp_eve_buf[0]); - idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]); - idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst, - dst_stride); -} - -void aom_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); - int16_t *out_ptr = out_arr; - - /* transform rows */ - for (i = 0; i < 4; ++i) { - /* process 32 * 8 block */ - idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8))); - } - - /* transform columns */ - for (i = 0; i < 4; ++i) { - /* process 8 * 32 block */ - idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct32x32_34_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]); - int16_t *out_ptr = out_arr; - - for (i = 32; i--;) { - __asm__ __volatile__( - "sw $zero, 0(%[out_ptr]) \n\t" - "sw $zero, 4(%[out_ptr]) \n\t" - "sw $zero, 8(%[out_ptr]) \n\t" - "sw $zero, 12(%[out_ptr]) \n\t" - "sw $zero, 16(%[out_ptr]) \n\t" - "sw $zero, 20(%[out_ptr]) \n\t" - "sw $zero, 24(%[out_ptr]) \n\t" - "sw $zero, 28(%[out_ptr]) \n\t" - "sw $zero, 32(%[out_ptr]) \n\t" - "sw $zero, 36(%[out_ptr]) \n\t" - "sw $zero, 40(%[out_ptr]) \n\t" - "sw $zero, 44(%[out_ptr]) \n\t" - "sw $zero, 48(%[out_ptr]) \n\t" - "sw $zero, 52(%[out_ptr]) \n\t" - "sw $zero, 56(%[out_ptr]) \n\t" - "sw $zero, 60(%[out_ptr]) \n\t" - - : - : [out_ptr] "r"(out_ptr)); - - out_ptr += 32; - } - - out_ptr = out_arr; - - /* rows: only upper-left 8x8 has non-zero coeff */ - idct32x8_1d_rows_msa(input, out_ptr); - - /* transform columns */ - for (i = 0; i < 4; ++i) { - /* process 8 * 32 block */ - idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dst + (i << 3)), - dst_stride); - } -} - -void aom_idct32x32_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int32_t i; - int16_t out; - v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3; - v8i16 res0, res1, res2, res3, res4, res5, res6, res7, vec; - - out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO(out, 6); - - vec = __msa_fill_h(out); - - for (i = 16; i--;) { - LD_UB2(dst, 16, dst0, dst1); - LD_UB2(dst + dst_stride, 16, dst2, dst3); - - UNPCK_UB_SH(dst0, res0, res4); - UNPCK_UB_SH(dst1, res1, res5); - UNPCK_UB_SH(dst2, res2, res6); - UNPCK_UB_SH(dst3, res3, res7); - ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); - ADD4(res4, vec, res5, vec, res6, vec, res7, vec, res4, res5, res6, res7); - CLIP_SH4_0_255(res0, res1, res2, res3); - CLIP_SH4_0_255(res4, res5, res6, res7); - PCKEV_B4_UB(res4, res0, res5, res1, res6, res2, res7, res3, tmp0, tmp1, - tmp2, tmp3); - - ST_UB2(tmp0, tmp1, dst, 16); - dst += dst_stride; - ST_UB2(tmp2, tmp3, dst, 16); - dst += dst_stride; - } -} diff --git a/third_party/aom/aom_dsp/mips/idct4x4_msa.c b/third_party/aom/aom_dsp/mips/idct4x4_msa.c deleted file mode 100644 index 274818baa..000000000 --- a/third_party/aom/aom_dsp/mips/idct4x4_msa.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3; - v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; - - /* load vector elements of 4x4 block */ - LD4x4_SH(input, in0, in2, in3, in1); - TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); - UNPCK_R_SH_SW(in0, in0_r); - UNPCK_R_SH_SW(in2, in2_r); - UNPCK_R_SH_SW(in3, in3_r); - UNPCK_R_SH_SW(in1, in1_r); - SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); - - in0_r += in2_r; - in3_r -= in1_r; - in4_r = (in0_r - in3_r) >> 1; - in1_r = in4_r - in1_r; - in2_r = in4_r - in2_r; - in0_r -= in1_r; - in3_r += in2_r; - - TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); - - in0_r += in1_r; - in2_r -= in3_r; - in4_r = (in0_r - in2_r) >> 1; - in3_r = in4_r - in3_r; - in1_r = in4_r - in1_r; - in0_r -= in3_r; - in2_r += in1_r; - - PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, - in2, in3); - ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); -} - -void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int16_t a1, e1; - v8i16 in1, in0 = { 0 }; - - a1 = input[0] >> UNIT_QUANT_SHIFT; - e1 = a1 >> 1; - a1 -= e1; - - in0 = __msa_insert_h(in0, 0, a1); - in0 = __msa_insert_h(in0, 1, e1); - in0 = __msa_insert_h(in0, 2, e1); - in0 = __msa_insert_h(in0, 3, e1); - - in1 = in0 >> 1; - in0 -= in1; - - ADDBLK_ST4x4_UB(in0, in1, in1, in1, dst, dst_stride); -} - -void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3; - - /* load vector elements of 4x4 block */ - LD4x4_SH(input, in0, in1, in2, in3); - /* rows */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* columns */ - TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); - /* rounding (add 2^3, divide by 2^4) */ - SRARI_H4_SH(in0, in1, in2, in3, 4); - ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); -} - -void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int16_t out; - v8i16 vec; - - out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO(out, 4); - vec = __msa_fill_h(out); - - ADDBLK_ST4x4_UB(vec, vec, vec, vec, dst, dst_stride); -} diff --git a/third_party/aom/aom_dsp/mips/idct8x8_msa.c b/third_party/aom/aom_dsp/mips/idct8x8_msa.c deleted file mode 100644 index 981c103cd..000000000 --- a/third_party/aom/aom_dsp/mips/idct8x8_msa.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/mips/inv_txfm_msa.h" - -void aom_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - - /* load vector elements of 8x8 block */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - - /* rows transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* 1D idct8x8 */ - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* columns transform */ - TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* 1D idct8x8 */ - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - /* final rounding (add 2^4, divide by 2^5) and shift */ - SRARI_H4_SH(in0, in1, in2, in3, 5); - SRARI_H4_SH(in4, in5, in6, in7, 5); - /* add block and store 8x8 */ - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); - dst += (4 * dst_stride); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); -} - -void aom_idct8x8_12_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v8i16 s0, s1, s2, s3, s4, s5, s6, s7, k0, k1, k2, k3, m0, m1, m2, m3; - v4i32 tmp0, tmp1, tmp2, tmp3; - v8i16 zero = { 0 }; - - /* load vector elements of 8x8 block */ - LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); - TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); - - /* stage1 */ - ILVL_H2_SH(in3, in0, in2, in1, s0, s1); - k0 = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); - k1 = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); - k2 = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); - k3 = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); - DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); - SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); - PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); - PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); - BUTTERFLY_4(s0, s1, s3, s2, s4, s7, s6, s5); - - /* stage2 */ - ILVR_H2_SH(in3, in1, in2, in0, s1, s0); - k0 = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); - k1 = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); - k2 = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); - k3 = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); - DOTP_SH4_SW(s0, s0, s1, s1, k0, k1, k2, k3, tmp0, tmp1, tmp2, tmp3); - SRARI_W4_SW(tmp0, tmp1, tmp2, tmp3, DCT_CONST_BITS); - PCKEV_H2_SH(zero, tmp0, zero, tmp1, s0, s1); - PCKEV_H2_SH(zero, tmp2, zero, tmp3, s2, s3); - BUTTERFLY_4(s0, s1, s2, s3, m0, m1, m2, m3); - - /* stage3 */ - s0 = __msa_ilvr_h(s6, s5); - - k1 = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); - DOTP_SH2_SW(s0, s0, k1, k0, tmp0, tmp1); - SRARI_W2_SW(tmp0, tmp1, DCT_CONST_BITS); - PCKEV_H2_SH(zero, tmp0, zero, tmp1, s2, s3); - - /* stage4 */ - BUTTERFLY_8(m0, m1, m2, m3, s4, s2, s3, s7, in0, in1, in2, in3, in4, in5, in6, - in7); - TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - /* final rounding (add 2^4, divide by 2^5) and shift */ - SRARI_H4_SH(in0, in1, in2, in3, 5); - SRARI_H4_SH(in4, in5, in6, in7, 5); - - /* add block and store 8x8 */ - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3); - dst += (4 * dst_stride); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in4, in5, in6, in7); -} - -void aom_idct8x8_1_add_msa(const int16_t *input, uint8_t *dst, - int32_t dst_stride) { - int16_t out; - int32_t val; - v8i16 vec; - - out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS); - out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS); - val = ROUND_POWER_OF_TWO(out, 5); - vec = __msa_fill_h(val); - - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); - dst += (4 * dst_stride); - AOM_ADDBLK_ST8x4_UB(dst, dst_stride, vec, vec, vec, vec); -} diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c index bcb9c9df9..9f25cc1ca 100644 --- a/third_party/aom/aom_dsp/mips/intrapred_msa.c +++ b/third_party/aom/aom_dsp/mips/intrapred_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \ diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h b/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h deleted file mode 100644 index c69835173..000000000 --- a/third_party/aom/aom_dsp/mips/inv_txfm_dspr2.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ -#define AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ - -#include - -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_dsp/mips/common_dspr2.h" - -#ifdef __cplusplus -extern "C" { -#endif - -#if HAVE_DSPR2 -/* Note: this macro expects a local int32_t named out to exist, and will write - * to that variable. */ -#define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ - ({ \ - \ - int32_t tmp; \ - int dct_cost_rounding = DCT_CONST_ROUNDING; \ - int in = input; \ - \ - __asm__ __volatile__(/* out = dct_const_round_shift(dc * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac1 " \ - " \n\t" \ - "mthi $zero, $ac1 " \ - " \n\t" \ - "madd $ac1, %[in], " \ - "%[cospi_16_64] \n\t" \ - "extp %[tmp], $ac1, " \ - "31 \n\t" \ - \ - /* out = dct_const_round_shift(out * cospi_16_64); */ \ - "mtlo %[dct_cost_rounding], $ac2 " \ - " \n\t" \ - "mthi $zero, $ac2 " \ - " \n\t" \ - "madd $ac2, %[tmp], " \ - "%[cospi_16_64] \n\t" \ - "extp %[out], $ac2, " \ - "31 \n\t" \ - \ - : [tmp] "=&r"(tmp), [out] "=r"(out) \ - : [in] "r"(in), \ - [dct_cost_rounding] "r"(dct_cost_rounding), \ - [cospi_16_64] "r"(cospi_16_64)); \ - out; \ - }) - -void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); -void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output); -void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); -void iadst4_dspr2(const int16_t *input, int16_t *output); -void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); -void iadst8_dspr2(const int16_t *input, int16_t *output); -void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); -void iadst16_dspr2(const int16_t *input, int16_t *output); - -#endif // #if HAVE_DSPR2 -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h b/third_party/aom/aom_dsp/mips/inv_txfm_msa.h deleted file mode 100644 index 122667aa8..000000000 --- a/third_party/aom/aom_dsp/mips/inv_txfm_msa.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_MIPS_INV_TXFM_MSA_H_ -#define AOM_DSP_MIPS_INV_TXFM_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" -#include "aom_dsp/mips/txfm_macros_msa.h" -#include "aom_dsp/txfm_common.h" - -#define AOM_ADST8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ - out3, out4, out5, out6, out7) \ - { \ - v8i16 cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst4_m; \ - v8i16 vec0_m, vec1_m, vec2_m, vec3_m, s0_m, s1_m; \ - v8i16 coeff0_m = { cospi_2_64, cospi_6_64, cospi_10_64, cospi_14_64, \ - cospi_18_64, cospi_22_64, cospi_26_64, cospi_30_64 }; \ - v8i16 coeff1_m = { cospi_8_64, -cospi_8_64, cospi_16_64, -cospi_16_64, \ - cospi_24_64, -cospi_24_64, 0, 0 }; \ - \ - SPLATI_H2_SH(coeff0_m, 0, 7, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 4, 3, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in0, in7, vec1_m, vec0_m); \ - ILVRL_H2_SH(in4, in3, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in7, in0, in4, in3); \ - \ - SPLATI_H2_SH(coeff0_m, 2, 5, cnst0_m, cnst1_m); \ - cnst2_m = -cnst0_m; \ - ILVEV_H2_SH(cnst0_m, cnst1_m, cnst1_m, cnst2_m, cnst0_m, cnst1_m); \ - SPLATI_H2_SH(coeff0_m, 6, 1, cnst2_m, cnst3_m); \ - cnst4_m = -cnst2_m; \ - ILVEV_H2_SH(cnst2_m, cnst3_m, cnst3_m, cnst4_m, cnst2_m, cnst3_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst1_m, \ - cnst2_m, cnst3_m, in5, in2, in6, in1); \ - BUTTERFLY_4(in7, in0, in2, in5, s1_m, s0_m, in2, in5); \ - out7 = -s0_m; \ - out0 = s1_m; \ - \ - SPLATI_H4_SH(coeff1_m, 0, 4, 1, 5, cnst0_m, cnst1_m, cnst2_m, cnst3_m); \ - \ - ILVEV_H2_SH(cnst3_m, cnst0_m, cnst1_m, cnst2_m, cnst3_m, cnst2_m); \ - cnst0_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - cnst1_m = cnst0_m; \ - \ - ILVRL_H2_SH(in4, in3, vec1_m, vec0_m); \ - ILVRL_H2_SH(in6, in1, vec3_m, vec2_m); \ - DOT_ADD_SUB_SRARI_PCK(vec0_m, vec1_m, vec2_m, vec3_m, cnst0_m, cnst2_m, \ - cnst3_m, cnst1_m, out1, out6, s0_m, s1_m); \ - \ - SPLATI_H2_SH(coeff1_m, 2, 3, cnst0_m, cnst1_m); \ - cnst1_m = __msa_ilvev_h(cnst1_m, cnst0_m); \ - \ - ILVRL_H2_SH(in2, in5, vec1_m, vec0_m); \ - ILVRL_H2_SH(s0_m, s1_m, vec3_m, vec2_m); \ - out3 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m); \ - out4 = DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m); \ - out2 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m); \ - out5 = DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m); \ - \ - out1 = -out1; \ - out3 = -out3; \ - out5 = -out5; \ - } - -#define AOM_SET_COSPI_PAIR(c0_h, c1_h) \ - ({ \ - v8i16 out0_m, r0_m, r1_m; \ - \ - r0_m = __msa_fill_h(c0_h); \ - r1_m = __msa_fill_h(c1_h); \ - out0_m = __msa_ilvev_h(r1_m, r0_m); \ - \ - out0_m; \ - }) - -#define AOM_ADDBLK_ST8x4_UB(dst, dst_stride, in0, in1, in2, in3) \ - { \ - uint8_t *dst_m = (uint8_t *)(dst); \ - v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \ - v16i8 tmp0_m, tmp1_m; \ - v16i8 zero_m = { 0 }; \ - v8i16 res0_m, res1_m, res2_m, res3_m; \ - \ - LD_UB4(dst_m, dst_stride, dst0_m, dst1_m, dst2_m, dst3_m); \ - ILVR_B4_SH(zero_m, dst0_m, zero_m, dst1_m, zero_m, dst2_m, zero_m, dst3_m, \ - res0_m, res1_m, res2_m, res3_m); \ - ADD4(res0_m, in0, res1_m, in1, res2_m, in2, res3_m, in3, res0_m, res1_m, \ - res2_m, res3_m); \ - CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \ - PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \ - } - -#define AOM_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 c0_m, c1_m, c2_m, c3_m; \ - v8i16 step0_m, step1_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - c0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - c1_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - step0_m = __msa_ilvr_h(in2, in0); \ - DOTP_SH2_SW(step0_m, step0_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - \ - c2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - c3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - step1_m = __msa_ilvr_h(in3, in1); \ - DOTP_SH2_SW(step1_m, step1_m, c2_m, c3_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - \ - PCKEV_H2_SW(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp0_m, tmp2_m); \ - SLDI_B2_0_SW(tmp0_m, tmp2_m, tmp1_m, tmp3_m, 8); \ - BUTTERFLY_4((v8i16)tmp0_m, (v8i16)tmp1_m, (v8i16)tmp2_m, (v8i16)tmp3_m, \ - out0, out1, out2, out3); \ - } - -#define AOM_IADST4x4(in0, in1, in2, in3, out0, out1, out2, out3) \ - { \ - v8i16 res0_m, res1_m, c0_m, c1_m; \ - v8i16 k1_m, k2_m, k3_m, k4_m; \ - v8i16 zero_m = { 0 }; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v4i32 int0_m, int1_m, int2_m, int3_m; \ - v8i16 mask_m = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9, \ - -sinpi_1_9, -sinpi_2_9, -sinpi_3_9, -sinpi_4_9 }; \ - \ - SPLATI_H4_SH(mask_m, 3, 0, 1, 2, c0_m, c1_m, k1_m, k2_m); \ - ILVEV_H2_SH(c0_m, c1_m, k1_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp2_m, tmp1_m); \ - int0_m = tmp2_m + tmp1_m; \ - \ - SPLATI_H2_SH(mask_m, 4, 7, k4_m, k3_m); \ - ILVEV_H2_SH(k4_m, k1_m, k3_m, k2_m, c0_m, c1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int1_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - ILVL_H2_SH(k2_m, c0_m, zero_m, k2_m, c0_m, c1_m); \ - ILVR_H2_SH(in0, in2, in1, in3, res0_m, res1_m); \ - DOTP_SH2_SW(res0_m, res1_m, c0_m, c1_m, tmp0_m, tmp1_m); \ - int2_m = tmp0_m + tmp1_m; \ - \ - c0_m = __msa_splati_h(mask_m, 6); \ - c0_m = __msa_ilvev_h(c0_m, k1_m); \ - \ - res0_m = __msa_ilvr_h((in1), (in3)); \ - tmp0_m = __msa_dotp_s_w(res0_m, c0_m); \ - int3_m = tmp2_m + tmp0_m; \ - \ - res0_m = __msa_ilvr_h((in2), (in3)); \ - c1_m = __msa_ilvev_h(k4_m, k3_m); \ - \ - tmp2_m = __msa_dotp_s_w(res0_m, c1_m); \ - res1_m = __msa_ilvr_h((in0), (in2)); \ - c1_m = __msa_ilvev_h(k1_m, zero_m); \ - \ - tmp3_m = __msa_dotp_s_w(res1_m, c1_m); \ - int3_m += tmp2_m; \ - int3_m += tmp3_m; \ - \ - SRARI_W4_SW(int0_m, int1_m, int2_m, int3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(int0_m, int0_m, int1_m, int1_m, out0, out1); \ - PCKEV_H2_SH(int2_m, int2_m, int3_m, int3_m, out2, out3); \ - } - -#define AV1_SET_CONST_PAIR(mask_h, idx1_h, idx2_h) \ - ({ \ - v8i16 c0_m, c1_m; \ - \ - SPLATI_H2_SH(mask_h, idx1_h, idx2_h, c0_m, c1_m); \ - c0_m = __msa_ilvev_h(c1_m, c0_m); \ - \ - c0_m; \ - }) - -/* multiply and add macro */ -#define AV1_MADD(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ - out2, out3) \ - { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ - DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ - cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ - SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ - DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ - cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ - SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ - } - -/* idct 8x8 macro */ -#define AOM_IDCT8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v8i16 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m, res0_m, res1_m, res2_m, res3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - v8i16 mask_m = { cospi_28_64, cospi_4_64, cospi_20_64, cospi_12_64, \ - cospi_16_64, -cospi_4_64, -cospi_20_64, -cospi_16_64 }; \ - \ - k0_m = AV1_SET_CONST_PAIR(mask_m, 0, 5); \ - k1_m = AV1_SET_CONST_PAIR(mask_m, 1, 0); \ - k2_m = AV1_SET_CONST_PAIR(mask_m, 6, 3); \ - k3_m = AV1_SET_CONST_PAIR(mask_m, 3, 2); \ - AV1_MADD(in1, in7, in3, in5, k0_m, k1_m, k2_m, k3_m, in1, in7, in3, in5); \ - SUB2(in1, in3, in7, in5, res0_m, res1_m); \ - k0_m = AV1_SET_CONST_PAIR(mask_m, 4, 7); \ - k1_m = __msa_splati_h(mask_m, 4); \ - \ - ILVRL_H2_SH(res0_m, res1_m, res2_m, res3_m); \ - DOTP_SH4_SW(res2_m, res3_m, res2_m, res3_m, k0_m, k0_m, k1_m, k1_m, \ - tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - tp4_m = in1 + in3; \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, tp5_m, tp6_m); \ - tp7_m = in7 + in5; \ - k2_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - AV1_MADD(in0, in4, in2, in6, k1_m, k0_m, k2_m, k3_m, in0, in4, in2, in6); \ - BUTTERFLY_4(in0, in4, in2, in6, tp0_m, tp1_m, tp2_m, tp3_m); \ - BUTTERFLY_8(tp0_m, tp1_m, tp2_m, tp3_m, tp4_m, tp5_m, tp6_m, tp7_m, out0, \ - out1, out2, out3, out4, out5, out6, out7); \ - } - -#define AV1_IADST8x8_1D(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - v4i32 r0_m, r1_m, r2_m, r3_m, r4_m, r5_m, r6_m, r7_m; \ - v4i32 m0_m, m1_m, m2_m, m3_m, t0_m, t1_m; \ - v8i16 res0_m, res1_m, res2_m, res3_m, k0_m, k1_m, in_s0, in_s1; \ - v8i16 mask1_m = { cospi_2_64, cospi_30_64, -cospi_2_64, cospi_10_64, \ - cospi_22_64, -cospi_10_64, cospi_18_64, cospi_14_64 }; \ - v8i16 mask2_m = { cospi_14_64, -cospi_18_64, cospi_26_64, cospi_6_64, \ - -cospi_26_64, cospi_8_64, cospi_24_64, -cospi_8_64 }; \ - v8i16 mask3_m = { \ - -cospi_24_64, cospi_8_64, cospi_16_64, -cospi_16_64, 0, 0, 0, 0 \ - }; \ - \ - k0_m = AV1_SET_CONST_PAIR(mask1_m, 0, 1); \ - k1_m = AV1_SET_CONST_PAIR(mask1_m, 1, 2); \ - ILVRL_H2_SH(in1, in0, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ - r1_m, r2_m, r3_m); \ - k0_m = AV1_SET_CONST_PAIR(mask1_m, 6, 7); \ - k1_m = AV1_SET_CONST_PAIR(mask2_m, 0, 1); \ - ILVRL_H2_SH(in5, in4, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ - r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res0_m, res1_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, t0_m, t1_m); \ - k0_m = AV1_SET_CONST_PAIR(mask1_m, 3, 4); \ - k1_m = AV1_SET_CONST_PAIR(mask1_m, 4, 5); \ - ILVRL_H2_SH(in3, in2, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ - r1_m, r2_m, r3_m); \ - k0_m = AV1_SET_CONST_PAIR(mask2_m, 2, 3); \ - k1_m = AV1_SET_CONST_PAIR(mask2_m, 3, 4); \ - ILVRL_H2_SH(in7, in6, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r4_m, \ - r5_m, r6_m, r7_m); \ - ADD4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, res2_m, res3_m); \ - SUB4(r0_m, r4_m, r1_m, r5_m, r2_m, r6_m, r3_m, r7_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SW(m1_m, m0_m, m3_m, m2_m, r2_m, r3_m); \ - ILVRL_H2_SW(r3_m, r2_m, m2_m, m3_m); \ - BUTTERFLY_4(res0_m, res1_m, res3_m, res2_m, out0, in7, in4, in3); \ - k0_m = AV1_SET_CONST_PAIR(mask2_m, 5, 6); \ - k1_m = AV1_SET_CONST_PAIR(mask2_m, 6, 7); \ - ILVRL_H2_SH(t1_m, t0_m, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, r0_m, \ - r1_m, r2_m, r3_m); \ - k1_m = AV1_SET_CONST_PAIR(mask3_m, 0, 1); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, r4_m, r5_m, \ - r6_m, r7_m); \ - ADD4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in1, out6); \ - SUB4(r0_m, r6_m, r1_m, r7_m, r2_m, r4_m, r3_m, r5_m, m0_m, m1_m, m2_m, \ - m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in2, in5); \ - k0_m = AV1_SET_CONST_PAIR(mask3_m, 2, 2); \ - k1_m = AV1_SET_CONST_PAIR(mask3_m, 2, 3); \ - ILVRL_H2_SH(in4, in3, in_s1, in_s0); \ - DOTP_SH4_SW(in_s1, in_s0, in_s1, in_s0, k0_m, k0_m, k1_m, k1_m, m0_m, \ - m1_m, m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, in3, out4); \ - ILVRL_H2_SW(in5, in2, m2_m, m3_m); \ - DOTP_SH4_SW(m2_m, m3_m, m2_m, m3_m, k0_m, k0_m, k1_m, k1_m, m0_m, m1_m, \ - m2_m, m3_m); \ - SRARI_W4_SW(m0_m, m1_m, m2_m, m3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m1_m, m0_m, m3_m, m2_m, out2, in5); \ - \ - out1 = -in1; \ - out3 = -in3; \ - out5 = -in5; \ - out7 = -in7; \ - } - -#define AOM_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, \ - r12, r13, r14, r15, out0, out1, out2, out3, out4, \ - out5, out6, out7, out8, out9, out10, out11, out12, \ - out13, out14, out15) \ - { \ - v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \ - v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \ - v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \ - v8i16 h8_m, h9_m, h10_m, h11_m; \ - v8i16 k0_m, k1_m, k2_m, k3_m; \ - \ - /* stage 1 */ \ - k0_m = AOM_SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \ - MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, g0_m, g1_m, g2_m, g3_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \ - MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, g4_m, g5_m, g6_m, g7_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \ - MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, g8_m, g9_m, g10_m, \ - g11_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \ - k3_m = AOM_SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \ - MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, g12_m, g13_m, g14_m, \ - g15_m); \ - \ - /* stage 2 */ \ - k0_m = AOM_SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \ - k2_m = AOM_SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \ - MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, h0_m, h1_m, h2_m, \ - h3_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \ - k1_m = AOM_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \ - MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, h4_m, h5_m, \ - h6_m, h7_m); \ - BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \ - BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, h8_m, h9_m, \ - h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \ - \ - /* stage 3 */ \ - BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \ - k0_m = AOM_SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \ - k1_m = AOM_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \ - k2_m = AOM_SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \ - MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, out4, out6, out5, \ - out7); \ - MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, out12, out14, \ - out13, out15); \ - \ - /* stage 4 */ \ - k0_m = AOM_SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \ - k1_m = AOM_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \ - k2_m = AOM_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \ - k3_m = AOM_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \ - MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \ - MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \ - MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \ - MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \ - } - -void aom_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride); -void aom_idct16_1d_rows_msa(const int16_t *input, int16_t *output); -void aom_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, - int32_t dst_stride); -void aom_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); -#endif // AOM_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c b/third_party/aom/aom_dsp/mips/itrans16_dspr2.c deleted file mode 100644 index c63b1e857..000000000 --- a/third_party/aom/aom_dsp/mips/itrans16_dspr2.c +++ /dev/null @@ -1,1190 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void idct16_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { - int i; - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int step1_10, step1_11, step1_12, step1_13; - int step2_0, step2_1, step2_2, step2_3; - int step2_8, step2_9, step2_10, step2_11; - int step2_12, step2_13, step2_14, step2_15; - int load1, load2, load3, load4, load5, load6, load7, load8; - int result1, result2, result3, result4; - const int const_2_power_13 = 8192; - - for (i = no_rows; i--;) { - /* prefetch row */ - prefetch_load((const uint8_t *)(input + 16)); - - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 16(%[input]) \n\t" - "lh %[load3], 8(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[step2_0], $ac1, 31 \n\t" - "extp %[step2_1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[step2_2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[step2_3], $ac1, 31 \n\t" - - "add %[step1_0], %[step2_0], %[step2_3] \n\t" - "add %[step1_1], %[step2_1], %[step2_2] \n\t" - "sub %[step1_2], %[step2_1], %[step2_2] \n\t" - "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), - [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), - [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "lh %[load5], 2(%[input]) \n\t" - "lh %[load6], 30(%[input]) \n\t" - "lh %[load7], 18(%[input]) \n\t" - "lh %[load8], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_30_64] \n\t" - "msub $ac1, %[load6], %[cospi_2_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_14_64] \n\t" - "msub $ac3, %[load8], %[cospi_18_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_18_64] \n\t" - "madd $ac1, %[load8], %[cospi_14_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_2_64] \n\t" - "madd $ac2, %[load6], %[cospi_30_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "sub %[load5], %[result1], %[result2] \n\t" - "sub %[load6], %[result4], %[result3] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load6], %[cospi_24_64] \n\t" - "msub $ac1, %[load5], %[cospi_8_64] \n\t" - "madd $ac3, %[load5], %[cospi_24_64] \n\t" - "madd $ac3, %[load6], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[result1], %[result2] \n\t" - "add %[step2_15], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_8] "=r"(step2_8), - [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), - [step2_14] "=r"(step2_14) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 22(%[input]) \n\t" - "lh %[load3], 26(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load3], %[cospi_6_64] \n\t" - "msub $ac3, %[load4], %[cospi_26_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load1], %[cospi_10_64] \n\t" - "madd $ac1, %[load2], %[cospi_22_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load3], %[cospi_26_64] \n\t" - "madd $ac2, %[load4], %[cospi_6_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[result2], %[result1] \n\t" - "sub %[load2], %[result4], %[result3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[result1], %[result2] \n\t" - "add %[step2_12], %[result4], %[result3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load5], 4(%[input]) \n\t" - "lh %[load6], 28(%[input]) \n\t" - "lh %[load7], 20(%[input]) \n\t" - "lh %[load8], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_28_64] \n\t" - "msub $ac1, %[load6], %[cospi_4_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_12_64] \n\t" - "msub $ac3, %[load8], %[cospi_20_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_20_64] \n\t" - "madd $ac1, %[load8], %[cospi_12_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_4_64] \n\t" - "madd $ac2, %[load6], %[cospi_28_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[result4], %[result3] \n\t" - "sub %[load5], %[load5], %[result1] \n\t" - "add %[load5], %[load5], %[result2] \n\t" - - "sub %[load6], %[result1], %[result2] \n\t" - "sub %[load6], %[load6], %[result3] \n\t" - "add %[load6], %[load6], %[result4] \n\t" - - "madd $ac1, %[load5], %[cospi_16_64] \n\t" - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[result1], %[result2] \n\t" - "add %[step1_7], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "sub %[load5], %[step2_14], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - - "madd $ac0, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_14], %[step2_13] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "add %[load6], %[load6], %[step2_9] \n\t" - - "madd $ac1, %[load6], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[step2_15], %[step2_12] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "add %[load5], %[load5], %[step2_11] \n\t" - - "madd $ac2, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_15], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_11] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_10], $ac0, 31 \n\t" - "extp %[step1_13], $ac1, 31 \n\t" - "extp %[step1_11], $ac2, 31 \n\t" - "extp %[step1_12], $ac3, 31 \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), - [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), - [step1_13] "=r"(step1_13) - : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), - [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), - [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), - [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), - [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "add %[load5], %[step1_0], %[step1_7] \n\t" - "add %[load5], %[load5], %[step2_12] \n\t" - "add %[load5], %[load5], %[step2_15] \n\t" - "add %[load6], %[step1_1], %[step1_6] \n\t" - "add %[load6], %[load6], %[step2_13] \n\t" - "add %[load6], %[load6], %[step2_14] \n\t" - "sh %[load5], 0(%[output]) \n\t" - "sh %[load6], 32(%[output]) \n\t" - "sub %[load5], %[step1_1], %[step1_6] \n\t" - "add %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - "sub %[load6], %[step1_0], %[step1_7] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - "add %[load6], %[load6], %[step2_11] \n\t" - "sh %[load5], 192(%[output]) \n\t" - "sh %[load6], 224(%[output]) \n\t" - "sub %[load5], %[step1_0], %[step1_7] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "sub %[load5], %[load5], %[step2_11] \n\t" - "sub %[load6], %[step1_1], %[step1_6] \n\t" - "sub %[load6], %[load6], %[step2_9] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "sh %[load5], 256(%[output]) \n\t" - "sh %[load6], 288(%[output]) \n\t" - "add %[load5], %[step1_1], %[step1_6] \n\t" - "sub %[load5], %[load5], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_14] \n\t" - "add %[load6], %[step1_0], %[step1_7] \n\t" - "sub %[load6], %[load6], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_15] \n\t" - "sh %[load5], 448(%[output]) \n\t" - "sh %[load6], 480(%[output]) \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6) - : [output] "r"(output), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), - [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), - [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), - [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), - [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), - [step2_14] "r"(step2_14), [step2_15] "r"(step2_15)); - - __asm__ __volatile__( - "add %[load5], %[step1_2], %[step1_5] \n\t" - "add %[load5], %[load5], %[step1_13] \n\t" - "add %[load6], %[step1_3], %[step1_4] \n\t" - "add %[load6], %[load6], %[step1_12] \n\t" - "sh %[load5], 64(%[output]) \n\t" - "sh %[load6], 96(%[output]) \n\t" - "sub %[load5], %[step1_3], %[step1_4] \n\t" - "add %[load5], %[load5], %[step1_11] \n\t" - "sub %[load6], %[step1_2], %[step1_5] \n\t" - "add %[load6], %[load6], %[step1_10] \n\t" - "sh %[load5], 128(%[output]) \n\t" - "sh %[load6], 160(%[output]) \n\t" - "sub %[load5], %[step1_2], %[step1_5] \n\t" - "sub %[load5], %[load5], %[step1_10] \n\t" - "sub %[load6], %[step1_3], %[step1_4] \n\t" - "sub %[load6], %[load6], %[step1_11] \n\t" - "sh %[load5], 320(%[output]) \n\t" - "sh %[load6], 352(%[output]) \n\t" - "add %[load5], %[step1_3], %[step1_4] \n\t" - "sub %[load5], %[load5], %[step1_12] \n\t" - "add %[load6], %[step1_2], %[step1_5] \n\t" - "sub %[load6], %[load6], %[step1_13] \n\t" - "sh %[load5], 384(%[output]) \n\t" - "sh %[load6], 416(%[output]) \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6) - : [output] "r"(output), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), - [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), - [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13)); - - input += 16; - output += 1; - } -} - -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { - int i; - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int step1_8, step1_9, step1_10, step1_11; - int step1_12, step1_13, step1_14, step1_15; - int step2_0, step2_1, step2_2, step2_3; - int step2_8, step2_9, step2_10, step2_11; - int step2_12, step2_13, step2_14, step2_15; - int load1, load2, load3, load4, load5, load6, load7, load8; - int result1, result2, result3, result4; - const int const_2_power_13 = 8192; - uint8_t *dest_pix; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 16; ++i) { - dest_pix = (dest + i); - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 16(%[input]) \n\t" - "lh %[load3], 8(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[step2_0], $ac1, 31 \n\t" - "extp %[step2_1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[step2_2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[step2_3], $ac1, 31 \n\t" - - "add %[step1_0], %[step2_0], %[step2_3] \n\t" - "add %[step1_1], %[step2_1], %[step2_2] \n\t" - "sub %[step1_2], %[step2_1], %[step2_2] \n\t" - "sub %[step1_3], %[step2_0], %[step2_3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [step2_0] "=&r"(step2_0), - [step2_1] "=&r"(step2_1), [step2_2] "=&r"(step2_2), - [step2_3] "=&r"(step2_3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "lh %[load5], 2(%[input]) \n\t" - "lh %[load6], 30(%[input]) \n\t" - "lh %[load7], 18(%[input]) \n\t" - "lh %[load8], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_30_64] \n\t" - "msub $ac1, %[load6], %[cospi_2_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_14_64] \n\t" - "msub $ac3, %[load8], %[cospi_18_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_18_64] \n\t" - "madd $ac1, %[load8], %[cospi_14_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_2_64] \n\t" - "madd $ac2, %[load6], %[cospi_30_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "sub %[load5], %[result1], %[result2] \n\t" - "sub %[load6], %[result4], %[result3] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load6], %[cospi_24_64] \n\t" - "msub $ac1, %[load5], %[cospi_8_64] \n\t" - "madd $ac3, %[load5], %[cospi_24_64] \n\t" - "madd $ac3, %[load6], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[result1], %[result2] \n\t" - "add %[step2_15], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_8] "=r"(step2_8), - [step2_15] "=r"(step2_15), [step2_9] "=r"(step2_9), - [step2_14] "=r"(step2_14) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 22(%[input]) \n\t" - "lh %[load3], 26(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load3], %[cospi_6_64] \n\t" - "msub $ac3, %[load4], %[cospi_26_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load1], %[cospi_10_64] \n\t" - "madd $ac1, %[load2], %[cospi_22_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load3], %[cospi_26_64] \n\t" - "madd $ac2, %[load4], %[cospi_6_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[result2], %[result1] \n\t" - "sub %[load2], %[result4], %[result3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[result1], %[result2] \n\t" - "add %[step2_12], %[result4], %[result3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - - __asm__ __volatile__( - "lh %[load5], 4(%[input]) \n\t" - "lh %[load6], 28(%[input]) \n\t" - "lh %[load7], 20(%[input]) \n\t" - "lh %[load8], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load5], %[cospi_28_64] \n\t" - "msub $ac1, %[load6], %[cospi_4_64] \n\t" - "extp %[result1], $ac1, 31 \n\t" - - "madd $ac3, %[load7], %[cospi_12_64] \n\t" - "msub $ac3, %[load8], %[cospi_20_64] \n\t" - "extp %[result2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac1, %[load7], %[cospi_20_64] \n\t" - "madd $ac1, %[load8], %[cospi_12_64] \n\t" - "extp %[result3], $ac1, 31 \n\t" - - "madd $ac2, %[load5], %[cospi_4_64] \n\t" - "madd $ac2, %[load6], %[cospi_28_64] \n\t" - "extp %[result4], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[result4], %[result3] \n\t" - "sub %[load5], %[load5], %[result1] \n\t" - "add %[load5], %[load5], %[result2] \n\t" - - "sub %[load6], %[result1], %[result2] \n\t" - "sub %[load6], %[load6], %[result3] \n\t" - "add %[load6], %[load6], %[result4] \n\t" - - "madd $ac1, %[load5], %[cospi_16_64] \n\t" - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - - "add %[step1_4], %[result1], %[result2] \n\t" - "add %[step1_7], %[result4], %[result3] \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [result1] "=&r"(result1), - [result2] "=&r"(result2), [result3] "=&r"(result3), - [result4] "=&r"(result4), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "sub %[load5], %[step2_14], %[step2_13] \n\t" - "sub %[load5], %[load5], %[step2_9] \n\t" - "add %[load5], %[load5], %[step2_10] \n\t" - - "madd $ac0, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_14], %[step2_13] \n\t" - "sub %[load6], %[load6], %[step2_10] \n\t" - "add %[load6], %[load6], %[step2_9] \n\t" - - "madd $ac1, %[load6], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load5], %[step2_15], %[step2_12] \n\t" - "sub %[load5], %[load5], %[step2_8] \n\t" - "add %[load5], %[load5], %[step2_11] \n\t" - - "madd $ac2, %[load5], %[cospi_16_64] \n\t" - - "sub %[load6], %[step2_15], %[step2_12] \n\t" - "sub %[load6], %[load6], %[step2_11] \n\t" - "add %[load6], %[load6], %[step2_8] \n\t" - - "madd $ac3, %[load6], %[cospi_16_64] \n\t" - - "extp %[step1_10], $ac0, 31 \n\t" - "extp %[step1_13], $ac1, 31 \n\t" - "extp %[step1_11], $ac2, 31 \n\t" - "extp %[step1_12], $ac3, 31 \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [step1_10] "=r"(step1_10), - [step1_11] "=r"(step1_11), [step1_12] "=r"(step1_12), - [step1_13] "=r"(step1_13) - : [const_2_power_13] "r"(const_2_power_13), [step2_14] "r"(step2_14), - [step2_13] "r"(step2_13), [step2_9] "r"(step2_9), - [step2_10] "r"(step2_10), [step2_15] "r"(step2_15), - [step2_12] "r"(step2_12), [step2_8] "r"(step2_8), - [step2_11] "r"(step2_11), [cospi_16_64] "r"(cospi_16_64)); - - step1_8 = step2_8 + step2_11; - step1_9 = step2_9 + step2_10; - step1_14 = step2_13 + step2_14; - step1_15 = step2_12 + step2_15; - - __asm__ __volatile__( - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_0], %[step1_7] \n\t" - "add %[load5], %[load5], %[step1_15] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_1], %[step1_6] \n\t" - "add %[load6], %[load6], %[step1_14] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_2], %[step1_5] \n\t" - "add %[load5], %[load5], %[step1_13] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_3], %[step1_4] \n\t" - "add %[load6], %[load6], %[step1_12] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_3], %[step1_4] \n\t" - "add %[load5], %[load5], %[step1_11] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_2], %[step1_5] \n\t" - "add %[load6], %[load6], %[step1_10] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "sub %[load5], %[step1_1], %[step1_6] \n\t" - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[load5], %[step1_9] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_0], %[step1_7] \n\t" - "add %[load6], %[load6], %[step1_8] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_0], %[step1_7] \n\t" - "sub %[load5], %[load5], %[step1_8] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_1], %[step1_6] \n\t" - "sub %[load6], %[load6], %[step1_9] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "sub %[load5], %[step1_2], %[step1_5] \n\t" - "sub %[load5], %[load5], %[step1_10] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "sub %[load6], %[step1_3], %[step1_4] \n\t" - "sub %[load6], %[load6], %[step1_11] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_3], %[step1_4] \n\t" - "sub %[load5], %[load5], %[step1_12] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_2], %[step1_5] \n\t" - "sub %[load6], %[load6], %[step1_13] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[load7], 0(%[dest_pix]) \n\t" - "add %[load5], %[step1_1], %[step1_6] \n\t" - "sub %[load5], %[load5], %[step1_14] \n\t" - "addi %[load5], %[load5], 32 \n\t" - "sra %[load5], %[load5], 6 \n\t" - "add %[load7], %[load7], %[load5] \n\t" - "lbux %[load5], %[load7](%[cm]) \n\t" - "add %[load6], %[step1_0], %[step1_7] \n\t" - "sub %[load6], %[load6], %[step1_15] \n\t" - "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[load8], 0(%[dest_pix]) \n\t" - "addi %[load6], %[load6], 32 \n\t" - "sra %[load6], %[load6], 6 \n\t" - "add %[load8], %[load8], %[load6] \n\t" - "lbux %[load6], %[load8](%[cm]) \n\t" - "sb %[load6], 0(%[dest_pix]) \n\t" - - : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), - [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) - : - [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), - [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), - [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), - [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), - [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15)); - - input += 16; - } -} - -void aom_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows - idct16_rows_dspr2(input, out, 16); - - // Then transform columns and add to dest - idct16_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void aom_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[16 * 16]); - int16_t *outptr = out; - uint32_t i; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - idct16_rows_dspr2(input, outptr, 4); - - outptr += 4; - for (i = 0; i < 6; ++i) { - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 64(%[outptr]) \n\t" - "sw $zero, 96(%[outptr]) \n\t" - "sw $zero, 128(%[outptr]) \n\t" - "sw $zero, 160(%[outptr]) \n\t" - "sw $zero, 192(%[outptr]) \n\t" - "sw $zero, 224(%[outptr]) \n\t" - "sw $zero, 256(%[outptr]) \n\t" - "sw $zero, 288(%[outptr]) \n\t" - "sw $zero, 320(%[outptr]) \n\t" - "sw $zero, 352(%[outptr]) \n\t" - "sw $zero, 384(%[outptr]) \n\t" - "sw $zero, 416(%[outptr]) \n\t" - "sw $zero, 448(%[outptr]) \n\t" - "sw $zero, 480(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - - outptr += 2; - } - - // Then transform columns - idct16_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void aom_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - uint32_t pos = 45; - int32_t out; - int32_t r; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__( - "addi %[out], %[out], 32 \n\t" - "sra %[a1], %[out], 6 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 16; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 16; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } -} - -void iadst16_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; - - int x0 = input[15]; - int x1 = input[0]; - int x2 = input[13]; - int x3 = input[2]; - int x4 = input[11]; - int x5 = input[4]; - int x6 = input[9]; - int x7 = input[6]; - int x8 = input[7]; - int x9 = input[8]; - int x10 = input[5]; - int x11 = input[10]; - int x12 = input[3]; - int x13 = input[12]; - int x14 = input[1]; - int x15 = input[14]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] = output[13] = output[14] = output[15] = 0; - return; - } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = dct_const_round_shift(s0 + s8); - x1 = dct_const_round_shift(s1 + s9); - x2 = dct_const_round_shift(s2 + s10); - x3 = dct_const_round_shift(s3 + s11); - x4 = dct_const_round_shift(s4 + s12); - x5 = dct_const_round_shift(s5 + s13); - x6 = dct_const_round_shift(s6 + s14); - x7 = dct_const_round_shift(s7 + s15); - x8 = dct_const_round_shift(s0 - s8); - x9 = dct_const_round_shift(s1 - s9); - x10 = dct_const_round_shift(s2 - s10); - x11 = dct_const_round_shift(s3 - s11); - x12 = dct_const_round_shift(s4 - s12); - x13 = dct_const_round_shift(s5 - s13); - x14 = dct_const_round_shift(s6 - s14); - x15 = dct_const_round_shift(s7 - s15); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = s0 + s4; - x1 = s1 + s5; - x2 = s2 + s6; - x3 = s3 + s7; - x4 = s0 - s4; - x5 = s1 - s5; - x6 = s2 - s6; - x7 = s3 - s7; - x8 = dct_const_round_shift(s8 + s12); - x9 = dct_const_round_shift(s9 + s13); - x10 = dct_const_round_shift(s10 + s14); - x11 = dct_const_round_shift(s11 + s15); - x12 = dct_const_round_shift(s8 - s12); - x13 = dct_const_round_shift(s9 - s13); - x14 = dct_const_round_shift(s10 - s14); - x15 = dct_const_round_shift(s11 - s15); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = dct_const_round_shift(s4 + s6); - x5 = dct_const_round_shift(s5 + s7); - x6 = dct_const_round_shift(s4 - s6); - x7 = dct_const_round_shift(s5 - s7); - x8 = s8 + s10; - x9 = s9 + s11; - x10 = s8 - s10; - x11 = s9 - s11; - x12 = dct_const_round_shift(s12 + s14); - x13 = dct_const_round_shift(s13 + s15); - x14 = dct_const_round_shift(s12 - s14); - x15 = dct_const_round_shift(s13 - s15); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = dct_const_round_shift(s2); - x3 = dct_const_round_shift(s3); - x6 = dct_const_round_shift(s6); - x7 = dct_const_round_shift(s7); - x10 = dct_const_round_shift(s10); - x11 = dct_const_round_shift(s11); - x14 = dct_const_round_shift(s14); - x15 = dct_const_round_shift(s15); - - output[0] = x0; - output[1] = -x8; - output[2] = x12; - output[3] = -x4; - output[4] = x6; - output[5] = x14; - output[6] = x10; - output[7] = x2; - output[8] = x3; - output[9] = x11; - output[10] = x15; - output[11] = x7; - output[12] = x5; - output[13] = -x13; - output[14] = x9; - output[15] = -x1; -} - -#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c deleted file mode 100644 index d469d1ad0..000000000 --- a/third_party/aom/aom_dsp/mips/itrans32_cols_dspr2.c +++ /dev/null @@ -1,1042 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void aom_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; - int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; - int16_t step1_27, step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; - int16_t step3_28, step3_29, step3_30, step3_31; - int temp0, temp1, temp2, temp3; - int load1, load2, load3, load4; - int result1, result2; - int i, temp21; - uint8_t *dest_pix, *dest_pix1; - const int const_2_power_13 = 8192; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 32; ++i) { - dest_pix = dest + i; - dest_pix1 = dest + i + 31 * dest_stride; - - __asm__ __volatile__( - "lh %[load1], 2(%[input]) \n\t" - "lh %[load2], 62(%[input]) \n\t" - "lh %[load3], 34(%[input]) \n\t" - "lh %[load4], 30(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_31_64] \n\t" - "msub $ac1, %[load2], %[cospi_1_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_1_64] \n\t" - "madd $ac3, %[load2], %[cospi_31_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_15_64] \n\t" - "msub $ac2, %[load4], %[cospi_17_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_17_64] \n\t" - "madd $ac1, %[load4], %[cospi_15_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_17], $ac1, 31 \n\t" - "extp %[step1_30], $ac3, 31 \n\t" - "add %[step1_16], %[temp0], %[temp1] \n\t" - "add %[step1_31], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), - [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 18(%[input]) \n\t" - "lh %[load2], 46(%[input]) \n\t" - "lh %[load3], 50(%[input]) \n\t" - "lh %[load4], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_23_64] \n\t" - "msub $ac1, %[load2], %[cospi_9_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_9_64] \n\t" - "madd $ac3, %[load2], %[cospi_23_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_7_64] \n\t" - "msub $ac2, %[load4], %[cospi_25_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_25_64] \n\t" - "madd $ac1, %[load4], %[cospi_7_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "msub $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_18], $ac1, 31 \n\t" - "extp %[step1_29], $ac3, 31 \n\t" - "add %[step1_19], %[temp0], %[temp1] \n\t" - "add %[step1_28], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), - [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 54(%[input]) \n\t" - "lh %[load3], 42(%[input]) \n\t" - "lh %[load4], 22(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_27_64] \n\t" - "msub $ac1, %[load2], %[cospi_5_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_5_64] \n\t" - "madd $ac3, %[load2], %[cospi_27_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_11_64] \n\t" - "msub $ac2, %[load4], %[cospi_21_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_21_64] \n\t" - "madd $ac1, %[load4], %[cospi_11_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "madd $ac1, %[load2], %[cospi_12_64] \n\t" - "msub $ac1, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load1], %[cospi_12_64] \n\t" - "madd $ac3, %[load2], %[cospi_20_64] \n\t" - - "extp %[step1_21], $ac1, 31 \n\t" - "extp %[step1_26], $ac3, 31 \n\t" - "add %[step1_20], %[temp0], %[temp1] \n\t" - "add %[step1_27], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), - [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 26(%[input]) \n\t" - "lh %[load2], 38(%[input]) \n\t" - "lh %[load3], 58(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_19_64] \n\t" - "msub $ac1, %[load2], %[cospi_13_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_13_64] \n\t" - "madd $ac3, %[load2], %[cospi_19_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_3_64] \n\t" - "msub $ac2, %[load4], %[cospi_29_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_29_64] \n\t" - "madd $ac1, %[load4], %[cospi_3_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_12_64] \n\t" - "msub $ac1, %[load2], %[cospi_20_64] \n\t" - "msub $ac3, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load2], %[cospi_12_64] \n\t" - "extp %[step1_22], $ac1, 31 \n\t" - "extp %[step1_25], $ac3, 31 \n\t" - "add %[step1_23], %[temp0], %[temp1] \n\t" - "add %[step1_24], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), - [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 4(%[input]) \n\t" - "lh %[load2], 60(%[input]) \n\t" - "lh %[load3], 36(%[input]) \n\t" - "lh %[load4], 28(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_30_64] \n\t" - "msub $ac1, %[load2], %[cospi_2_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_2_64] \n\t" - "madd $ac3, %[load2], %[cospi_30_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_14_64] \n\t" - "msub $ac2, %[load4], %[cospi_18_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_18_64] \n\t" - "madd $ac1, %[load4], %[cospi_14_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - "msub $ac1, %[load1], %[cospi_8_64] \n\t" - "madd $ac1, %[load2], %[cospi_24_64] \n\t" - "madd $ac3, %[load1], %[cospi_24_64] \n\t" - "madd $ac3, %[load2], %[cospi_8_64] \n\t" - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[temp0], %[temp1] \n\t" - "add %[step2_15], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "lh %[load1], 20(%[input]) \n\t" - "lh %[load2], 44(%[input]) \n\t" - "lh %[load3], 52(%[input]) \n\t" - "lh %[load4], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_10_64] \n\t" - "madd $ac3, %[load2], %[cospi_22_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_6_64] \n\t" - "msub $ac2, %[load4], %[cospi_26_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_26_64] \n\t" - "madd $ac1, %[load4], %[cospi_6_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[temp0], %[temp1] \n\t" - "add %[step2_12], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "sub %[temp0], %[step2_14], %[step2_13] \n\t" - "sub %[temp0], %[temp0], %[step2_9] \n\t" - "add %[temp0], %[temp0], %[step2_10] \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sub %[temp1], %[step2_14], %[step2_13] \n\t" - "add %[temp1], %[temp1], %[step2_9] \n\t" - "sub %[temp1], %[temp1], %[step2_10] \n\t" - "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "sub %[temp0], %[step2_15], %[step2_12] \n\t" - "sub %[temp0], %[temp0], %[step2_8] \n\t" - "add %[temp0], %[temp0], %[step2_11] \n\t" - "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sub %[temp1], %[step2_15], %[step2_12] \n\t" - "add %[temp1], %[temp1], %[step2_8] \n\t" - "sub %[temp1], %[temp1], %[step2_11] \n\t" - "madd $ac3, %[temp1], %[cospi_16_64] \n\t" - - "add %[step3_8], %[step2_8], %[step2_11] \n\t" - "add %[step3_9], %[step2_9], %[step2_10] \n\t" - "add %[step3_14], %[step2_13], %[step2_14] \n\t" - "add %[step3_15], %[step2_12], %[step2_15] \n\t" - "extp %[step3_10], $ac0, 31 \n\t" - "extp %[step3_13], $ac1, 31 \n\t" - "extp %[step3_11], $ac2, 31 \n\t" - "extp %[step3_12], $ac3, 31 \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) - : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), - [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), - [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), - [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), - [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; - - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 32(%[input]) \n\t" - "lh %[load3], 16(%[input]) \n\t" - "lh %[load4], 48(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[temp2], $ac3, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[temp3], $ac1, 31 \n\t" - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_16_64] "r"(cospi_16_64)); - - __asm__ __volatile__( - "lh %[load1], 8(%[input]) \n\t" - "lh %[load2], 56(%[input]) \n\t" - "lh %[load3], 40(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_12_64] \n\t" - "msub $ac2, %[load4], %[cospi_20_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_20_64] \n\t" - "madd $ac1, %[load4], %[cospi_12_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load1], %[load1], %[temp0] \n\t" - "add %[load1], %[load1], %[temp1] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - "sub %[load2], %[load2], %[temp2] \n\t" - "add %[load2], %[load2], %[temp3] \n\t" - "madd $ac1, %[load1], %[cospi_16_64] \n\t" - "madd $ac3, %[load2], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[temp0], %[temp1] \n\t" - "add %[step1_7], %[temp3], %[temp2] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - // stage 7 - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_0], %[step2_31] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_1], %[step2_30] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_2], %[step2_29] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_3], %[step2_28] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), - [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), - [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), - [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), - [step2_31] "r"(step2_31)); - - step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_4], %[step1_27] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_5], %[step1_26] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_6], %[step1_25] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_7], %[step1_24] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), - [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), - [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), - [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), - [step1_27] "r"(step1_27)); - - step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_8], %[step1_23] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_9], %[step1_22] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_10], %[step1_21] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_11], %[step1_20] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), - [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), - [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), - [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), - [step1_23] "r"(step1_23)); - - step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_12], %[step2_19] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_13], %[step2_18] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix]) \n\t" - "add %[temp0], %[step1_14], %[step2_17] \n\t" - "addi %[temp0], %[temp0], 32 \n\t" - "sra %[temp0], %[temp0], 6 \n\t" - "add %[temp2], %[temp2], %[temp0] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "add %[temp1], %[step1_15], %[step2_16] \n\t" - "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix]) \n\t" - "addi %[temp1], %[temp1], 32 \n\t" - "sra %[temp1], %[temp1], 6 \n\t" - "add %[temp3], %[temp3], %[temp1] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix]) \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), - [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), - [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); - - step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); - step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); - step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6); - step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6); - - __asm__ __volatile__( - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_15] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_14] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - - "lbu %[temp2], 0(%[dest_pix1]) \n\t" - "add %[temp2], %[temp2], %[step3_13] \n\t" - "lbux %[temp0], %[temp2](%[cm]) \n\t" - "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" - "lbu %[temp3], 0(%[dest_pix1]) \n\t" - "add %[temp3], %[temp3], %[step3_12] \n\t" - "lbux %[temp1], %[temp3](%[cm]) \n\t" - "sb %[temp1], 0(%[dest_pix1]) \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), - [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); - - input += 32; - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c b/third_party/aom/aom_dsp/mips/itrans32_dspr2.c deleted file mode 100644 index fa7703217..000000000 --- a/third_party/aom/aom_dsp/mips/itrans32_dspr2.c +++ /dev/null @@ -1,1030 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_config.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -static void idct32_rows_dspr2(const int16_t *input, int16_t *output, - uint32_t no_rows) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; - int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; - int16_t step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; - int16_t step3_29, step3_30, step3_31; - int temp0, temp1, temp2, temp3; - int load1, load2, load3, load4; - int result1, result2; - int temp21; - int i; - const int const_2_power_13 = 8192; - const int32_t *input_int; - - for (i = no_rows; i--;) { - input_int = (const int32_t *)input; - - if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] | - input_int[4] | input_int[5] | input_int[6] | input_int[7] | - input_int[8] | input_int[9] | input_int[10] | input_int[11] | - input_int[12] | input_int[13] | input_int[14] | input_int[15])) { - input += 32; - - __asm__ __volatile__( - "sh $zero, 0(%[output]) \n\t" - "sh $zero, 64(%[output]) \n\t" - "sh $zero, 128(%[output]) \n\t" - "sh $zero, 192(%[output]) \n\t" - "sh $zero, 256(%[output]) \n\t" - "sh $zero, 320(%[output]) \n\t" - "sh $zero, 384(%[output]) \n\t" - "sh $zero, 448(%[output]) \n\t" - "sh $zero, 512(%[output]) \n\t" - "sh $zero, 576(%[output]) \n\t" - "sh $zero, 640(%[output]) \n\t" - "sh $zero, 704(%[output]) \n\t" - "sh $zero, 768(%[output]) \n\t" - "sh $zero, 832(%[output]) \n\t" - "sh $zero, 896(%[output]) \n\t" - "sh $zero, 960(%[output]) \n\t" - "sh $zero, 1024(%[output]) \n\t" - "sh $zero, 1088(%[output]) \n\t" - "sh $zero, 1152(%[output]) \n\t" - "sh $zero, 1216(%[output]) \n\t" - "sh $zero, 1280(%[output]) \n\t" - "sh $zero, 1344(%[output]) \n\t" - "sh $zero, 1408(%[output]) \n\t" - "sh $zero, 1472(%[output]) \n\t" - "sh $zero, 1536(%[output]) \n\t" - "sh $zero, 1600(%[output]) \n\t" - "sh $zero, 1664(%[output]) \n\t" - "sh $zero, 1728(%[output]) \n\t" - "sh $zero, 1792(%[output]) \n\t" - "sh $zero, 1856(%[output]) \n\t" - "sh $zero, 1920(%[output]) \n\t" - "sh $zero, 1984(%[output]) \n\t" - - : - : [output] "r"(output)); - - output += 1; - - continue; - } - - /* prefetch row */ - prefetch_load((const uint8_t *)(input + 32)); - prefetch_load((const uint8_t *)(input + 48)); - - __asm__ __volatile__( - "lh %[load1], 2(%[input]) \n\t" - "lh %[load2], 62(%[input]) \n\t" - "lh %[load3], 34(%[input]) \n\t" - "lh %[load4], 30(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_31_64] \n\t" - "msub $ac1, %[load2], %[cospi_1_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_1_64] \n\t" - "madd $ac3, %[load2], %[cospi_31_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_15_64] \n\t" - "msub $ac2, %[load4], %[cospi_17_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_17_64] \n\t" - "madd $ac1, %[load4], %[cospi_15_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_17], $ac1, 31 \n\t" - "extp %[step1_30], $ac3, 31 \n\t" - "add %[step1_16], %[temp0], %[temp1] \n\t" - "add %[step1_31], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), - [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 18(%[input]) \n\t" - "lh %[load2], 46(%[input]) \n\t" - "lh %[load3], 50(%[input]) \n\t" - "lh %[load4], 14(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_23_64] \n\t" - "msub $ac1, %[load2], %[cospi_9_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_9_64] \n\t" - "madd $ac3, %[load2], %[cospi_23_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_7_64] \n\t" - "msub $ac2, %[load4], %[cospi_25_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_25_64] \n\t" - "madd $ac1, %[load4], %[cospi_7_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "msub $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - - "extp %[step1_18], $ac1, 31 \n\t" - "extp %[step1_29], $ac3, 31 \n\t" - "add %[step1_19], %[temp0], %[temp1] \n\t" - "add %[step1_28], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), - [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64)); - - __asm__ __volatile__( - "lh %[load1], 10(%[input]) \n\t" - "lh %[load2], 54(%[input]) \n\t" - "lh %[load3], 42(%[input]) \n\t" - "lh %[load4], 22(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_27_64] \n\t" - "msub $ac1, %[load2], %[cospi_5_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_5_64] \n\t" - "madd $ac3, %[load2], %[cospi_27_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_11_64] \n\t" - "msub $ac2, %[load4], %[cospi_21_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_21_64] \n\t" - "madd $ac1, %[load4], %[cospi_11_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "madd $ac1, %[load2], %[cospi_12_64] \n\t" - "msub $ac1, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load1], %[cospi_12_64] \n\t" - "madd $ac3, %[load2], %[cospi_20_64] \n\t" - - "extp %[step1_21], $ac1, 31 \n\t" - "extp %[step1_26], $ac3, 31 \n\t" - "add %[step1_20], %[temp0], %[temp1] \n\t" - "add %[step1_27], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), - [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 26(%[input]) \n\t" - "lh %[load2], 38(%[input]) \n\t" - "lh %[load3], 58(%[input]) \n\t" - "lh %[load4], 6(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_19_64] \n\t" - "msub $ac1, %[load2], %[cospi_13_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_13_64] \n\t" - "madd $ac3, %[load2], %[cospi_19_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_3_64] \n\t" - "msub $ac2, %[load4], %[cospi_29_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_29_64] \n\t" - "madd $ac1, %[load4], %[cospi_3_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_12_64] \n\t" - "msub $ac1, %[load2], %[cospi_20_64] \n\t" - "msub $ac3, %[load1], %[cospi_20_64] \n\t" - "madd $ac3, %[load2], %[cospi_12_64] \n\t" - - "extp %[step1_22], $ac1, 31 \n\t" - "extp %[step1_25], $ac3, 31 \n\t" - "add %[step1_23], %[temp0], %[temp1] \n\t" - "add %[step1_24], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), - [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), - [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64)); - - __asm__ __volatile__( - "lh %[load1], 4(%[input]) \n\t" - "lh %[load2], 60(%[input]) \n\t" - "lh %[load3], 36(%[input]) \n\t" - "lh %[load4], 28(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_30_64] \n\t" - "msub $ac1, %[load2], %[cospi_2_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_2_64] \n\t" - "madd $ac3, %[load2], %[cospi_30_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_14_64] \n\t" - "msub $ac2, %[load4], %[cospi_18_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_18_64] \n\t" - "madd $ac1, %[load4], %[cospi_14_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp0], %[temp1] \n\t" - "sub %[load2], %[temp3], %[temp2] \n\t" - - "msub $ac1, %[load1], %[cospi_8_64] \n\t" - "madd $ac1, %[load2], %[cospi_24_64] \n\t" - "madd $ac3, %[load1], %[cospi_24_64] \n\t" - "madd $ac3, %[load2], %[cospi_8_64] \n\t" - - "extp %[step2_9], $ac1, 31 \n\t" - "extp %[step2_14], $ac3, 31 \n\t" - "add %[step2_8], %[temp0], %[temp1] \n\t" - "add %[step2_15], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), - [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "lh %[load1], 20(%[input]) \n\t" - "lh %[load2], 44(%[input]) \n\t" - "lh %[load3], 52(%[input]) \n\t" - "lh %[load4], 12(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_22_64] \n\t" - "msub $ac1, %[load2], %[cospi_10_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_10_64] \n\t" - "madd $ac3, %[load2], %[cospi_22_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_6_64] \n\t" - "msub $ac2, %[load4], %[cospi_26_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_26_64] \n\t" - "madd $ac1, %[load4], %[cospi_6_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp1], %[temp0] \n\t" - "sub %[load2], %[temp2], %[temp3] \n\t" - - "msub $ac1, %[load1], %[cospi_24_64] \n\t" - "msub $ac1, %[load2], %[cospi_8_64] \n\t" - "madd $ac3, %[load2], %[cospi_24_64] \n\t" - "msub $ac3, %[load1], %[cospi_8_64] \n\t" - - "extp %[step2_10], $ac1, 31 \n\t" - "extp %[step2_13], $ac3, 31 \n\t" - "add %[step2_11], %[temp0], %[temp1] \n\t" - "add %[step2_12], %[temp2], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), - [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), - [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64)); - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "sub %[temp0], %[step2_14], %[step2_13] \n\t" - "sub %[temp0], %[temp0], %[step2_9] \n\t" - "add %[temp0], %[temp0], %[step2_10] \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "sub %[temp1], %[step2_14], %[step2_13] \n\t" - "add %[temp1], %[temp1], %[step2_9] \n\t" - "sub %[temp1], %[temp1], %[step2_10] \n\t" - "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "sub %[temp0], %[step2_15], %[step2_12] \n\t" - "sub %[temp0], %[temp0], %[step2_8] \n\t" - "add %[temp0], %[temp0], %[step2_11] \n\t" - "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "sub %[temp1], %[step2_15], %[step2_12] \n\t" - "add %[temp1], %[temp1], %[step2_8] \n\t" - "sub %[temp1], %[temp1], %[step2_11] \n\t" - "madd $ac3, %[temp1], %[cospi_16_64] \n\t" - - "add %[step3_8], %[step2_8], %[step2_11] \n\t" - "add %[step3_9], %[step2_9], %[step2_10] \n\t" - "add %[step3_14], %[step2_13], %[step2_14] \n\t" - "add %[step3_15], %[step2_12], %[step2_15] \n\t" - - "extp %[step3_10], $ac0, 31 \n\t" - "extp %[step3_13], $ac1, 31 \n\t" - "extp %[step3_11], $ac2, 31 \n\t" - "extp %[step3_12], $ac3, 31 \n\t" - - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) - : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), - [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), - [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), - [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), - [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" - - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" - - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - - __asm__ __volatile__( - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64)); - - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; - - __asm__ __volatile__( - "lh %[load1], 0(%[input]) \n\t" - "lh %[load2], 32(%[input]) \n\t" - "lh %[load3], 16(%[input]) \n\t" - "lh %[load4], 48(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - "add %[result1], %[load1], %[load2] \n\t" - "sub %[result2], %[load1], %[load2] \n\t" - "madd $ac1, %[result1], %[cospi_16_64] \n\t" - "madd $ac2, %[result2], %[cospi_16_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - "madd $ac3, %[load3], %[cospi_24_64] \n\t" - "msub $ac3, %[load4], %[cospi_8_64] \n\t" - "extp %[temp2], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "madd $ac1, %[load3], %[cospi_8_64] \n\t" - "madd $ac1, %[load4], %[cospi_24_64] \n\t" - "extp %[temp3], $ac1, 31 \n\t" - - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [result1] "=&r"(result1), - [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64) - - ); - - __asm__ __volatile__( - "lh %[load1], 8(%[input]) \n\t" - "lh %[load2], 56(%[input]) \n\t" - "lh %[load3], 40(%[input]) \n\t" - "lh %[load4], 24(%[input]) \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "madd $ac1, %[load1], %[cospi_28_64] \n\t" - "msub $ac1, %[load2], %[cospi_4_64] \n\t" - "extp %[temp0], $ac1, 31 \n\t" - - "madd $ac3, %[load1], %[cospi_4_64] \n\t" - "madd $ac3, %[load2], %[cospi_28_64] \n\t" - "extp %[temp3], $ac3, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" - "mthi $zero, $ac2 \n\t" - - "madd $ac2, %[load3], %[cospi_12_64] \n\t" - "msub $ac2, %[load4], %[cospi_20_64] \n\t" - "extp %[temp1], $ac2, 31 \n\t" - - "madd $ac1, %[load3], %[cospi_20_64] \n\t" - "madd $ac1, %[load4], %[cospi_12_64] \n\t" - "extp %[temp2], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" - "mthi $zero, $ac3 \n\t" - - "sub %[load1], %[temp3], %[temp2] \n\t" - "sub %[load1], %[load1], %[temp0] \n\t" - "add %[load1], %[load1], %[temp1] \n\t" - - "sub %[load2], %[temp0], %[temp1] \n\t" - "sub %[load2], %[load2], %[temp2] \n\t" - "add %[load2], %[load2], %[temp3] \n\t" - - "madd $ac1, %[load1], %[cospi_16_64] \n\t" - "madd $ac3, %[load2], %[cospi_16_64] \n\t" - - "extp %[step1_5], $ac1, 31 \n\t" - "extp %[step1_6], $ac3, 31 \n\t" - "add %[step1_4], %[temp0], %[temp1] \n\t" - "add %[step1_7], %[temp3], %[temp2] \n\t" - - : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), - [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) - : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_16_64] "r"(cospi_16_64)); - - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - // final stage - output[0 * 32] = step1_0 + step2_31; - output[1 * 32] = step1_1 + step2_30; - output[2 * 32] = step1_2 + step2_29; - output[3 * 32] = step1_3 + step2_28; - output[4 * 32] = step1_4 + step1_27; - output[5 * 32] = step1_5 + step1_26; - output[6 * 32] = step1_6 + step1_25; - output[7 * 32] = step1_7 + step1_24; - output[8 * 32] = step1_8 + step1_23; - output[9 * 32] = step1_9 + step1_22; - output[10 * 32] = step1_10 + step1_21; - output[11 * 32] = step1_11 + step1_20; - output[12 * 32] = step1_12 + step2_19; - output[13 * 32] = step1_13 + step2_18; - output[14 * 32] = step1_14 + step2_17; - output[15 * 32] = step1_15 + step2_16; - output[16 * 32] = step1_15 - step2_16; - output[17 * 32] = step1_14 - step2_17; - output[18 * 32] = step1_13 - step2_18; - output[19 * 32] = step1_12 - step2_19; - output[20 * 32] = step1_11 - step1_20; - output[21 * 32] = step1_10 - step1_21; - output[22 * 32] = step1_9 - step1_22; - output[23 * 32] = step1_8 - step1_23; - output[24 * 32] = step1_7 - step1_24; - output[25 * 32] = step1_6 - step1_25; - output[26 * 32] = step1_5 - step1_26; - output[27 * 32] = step1_4 - step1_27; - output[28 * 32] = step1_3 - step2_28; - output[29 * 32] = step1_2 - step2_29; - output[30 * 32] = step1_1 - step2_30; - output[31 * 32] = step1_0 - step2_31; - - input += 32; - output += 1; - } -} - -void aom_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - // Rows - idct32_rows_dspr2(input, outptr, 32); - - // Columns - aom_idct32_cols_add_blk_dspr2(out, dest, dest_stride); -} - -void aom_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, - int stride) { - DECLARE_ALIGNED(32, int16_t, out[32 * 32]); - int16_t *outptr = out; - uint32_t i; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - // Rows - idct32_rows_dspr2(input, outptr, 8); - - outptr += 8; - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 8(%[outptr]) \n\t" - "sw $zero, 12(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 24(%[outptr]) \n\t" - "sw $zero, 28(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 40(%[outptr]) \n\t" - "sw $zero, 44(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - - for (i = 0; i < 31; ++i) { - outptr += 32; - - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 8(%[outptr]) \n\t" - "sw $zero, 12(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 24(%[outptr]) \n\t" - "sw $zero, 28(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 40(%[outptr]) \n\t" - "sw $zero, 44(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - } - - // Columns - aom_idct32_cols_add_blk_dspr2(out, dest, stride); -} - -void aom_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, - int stride) { - int r, out; - int32_t a1, absa1; - int32_t vector_a1; - int32_t t1, t2, t3, t4; - int32_t vector_1, vector_2, vector_3, vector_4; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__( - "addi %[out], %[out], 32 \n\t" - "sra %[a1], %[out], 6 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 32; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - - "lw %[t1], 16(%[dest]) \n\t" - "lw %[t2], 20(%[dest]) \n\t" - "lw %[t3], 24(%[dest]) \n\t" - "lw %[t4], 28(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 16(%[dest]) \n\t" - "sw %[vector_2], 20(%[dest]) \n\t" - "sw %[vector_3], 24(%[dest]) \n\t" - "sw %[vector_4], 28(%[dest]) \n\t" - - "add %[dest], %[dest], %[stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 32; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "lw %[t3], 8(%[dest]) \n\t" - "lw %[t4], 12(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "sw %[vector_3], 8(%[dest]) \n\t" - "sw %[vector_4], 12(%[dest]) \n\t" - - "lw %[t1], 16(%[dest]) \n\t" - "lw %[t2], 20(%[dest]) \n\t" - "lw %[t3], 24(%[dest]) \n\t" - "lw %[t4], 28(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t" - "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t" - "sw %[vector_1], 16(%[dest]) \n\t" - "sw %[vector_2], 20(%[dest]) \n\t" - "sw %[vector_3], 24(%[dest]) \n\t" - "sw %[vector_4], 28(%[dest]) \n\t" - - "add %[dest], %[dest], %[stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), - [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), - [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), - [dest] "+&r"(dest) - : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); - } - } -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c b/third_party/aom/aom_dsp/mips/itrans4_dspr2.c deleted file mode 100644 index e6d0367cd..000000000 --- a/third_party/aom/aom_dsp/mips/itrans4_dspr2.c +++ /dev/null @@ -1,342 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void aom_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; - const int const_2_power_13 = 8192; - int i; - - for (i = 4; i--;) { - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[2]) * cospi_16_64; - step_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[2]) * cospi_16_64; - step_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 4(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "extp %[step_0], $ac0, 31 \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "extp %[step_1], $ac1, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - /* - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - step_2 = dct_const_round_shift(temp1); - */ - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "extp %[step_2], $ac0, 31 \n\t" - - /* - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step_3 = dct_const_round_shift(temp2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[step_3], $ac1, 31 \n\t" - - /* - output[0] = step_0 + step_3; - output[4] = step_1 + step_2; - output[8] = step_1 - step_2; - output[12] = step_0 - step_3; - */ - "add %[Temp0], %[step_0], %[step_3] \n\t" - "sh %[Temp0], 0(%[output]) \n\t" - - "add %[Temp1], %[step_1], %[step_2] \n\t" - "sh %[Temp1], 8(%[output]) \n\t" - - "sub %[Temp2], %[step_1], %[step_2] \n\t" - "sh %[Temp2], 16(%[output]) \n\t" - - "sub %[Temp3], %[step_0], %[step_3] \n\t" - "sh %[Temp3], 24(%[output]) \n\t" - - : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), - [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input)); - - input += 4; - output += 1; - } -} - -void aom_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step_0, step_1, step_2, step_3; - int Temp0, Temp1, Temp2, Temp3; - const int const_2_power_13 = 8192; - int i; - uint8_t *dest_pix; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 4; ++i) { - dest_pix = (dest + i); - - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[2]) * cospi_16_64; - step_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[2]) * cospi_16_64; - step_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 4(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "extp %[step_0], $ac0, 31 \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "extp %[step_1], $ac1, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - /* - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - step_2 = dct_const_round_shift(temp1); - */ - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "extp %[step_2], $ac0, 31 \n\t" - - /* - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step_3 = dct_const_round_shift(temp2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[step_3], $ac1, 31 \n\t" - - /* - output[0] = step_0 + step_3; - output[4] = step_1 + step_2; - output[8] = step_1 - step_2; - output[12] = step_0 - step_3; - */ - "add %[Temp0], %[step_0], %[step_3] \n\t" - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_0], %[step_3] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "addi %[Temp0], %[Temp0], 8 \n\t" - "sra %[Temp0], %[Temp0], 4 \n\t" - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - - : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), - [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), - [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); - - input += 4; - } -} - -void aom_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[4 * 4]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - : - : [pos] "r"(pos)); - - // Rows - aom_idct4_rows_dspr2(input, outptr); - - // Columns - aom_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void aom_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - int a1, absa1; - int r; - int32_t out; - int t2, vector_a1, vector_a; - uint32_t pos = 45; - int16_t input_dc = input[0]; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc); - __asm__ __volatile__( - "addi %[out], %[out], 8 \n\t" - "sra %[a1], %[out], 4 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 4; r--;) { - __asm__ __volatile__( - "lw %[t2], 0(%[dest]) \n\t" - "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" - "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 4; r--;) { - __asm__ __volatile__( - "lw %[t2], 0(%[dest]) \n\t" - "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" - "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } -} - -void iadst4_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - int x0, x1, x2, x3; - - x0 = input[0]; - x1 = input[1]; - x2 = input[2]; - x3 = input[3]; - - if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = x0 - x2 + x3; - - x0 = s0 + s3 + s5; - x1 = s1 - s4 - s6; - x2 = sinpi_3_9 * s7; - x3 = s2; - - s0 = x0 + x3; - s1 = x1 + x3; - s2 = x2; - s3 = x0 + x1 - x3; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. - output[0] = dct_const_round_shift(s0); - output[1] = dct_const_round_shift(s1); - output[2] = dct_const_round_shift(s2); - output[3] = dct_const_round_shift(s3); -} -#endif // #if HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c b/third_party/aom/aom_dsp/mips/itrans8_dspr2.c deleted file mode 100644 index 0a20f76f2..000000000 --- a/third_party/aom/aom_dsp/mips/itrans8_dspr2.c +++ /dev/null @@ -1,645 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/mips/inv_txfm_dspr2.h" -#include "aom_dsp/txfm_common.h" - -#if HAVE_DSPR2 -void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - const int const_2_power_13 = 8192; - int Temp0, Temp1, Temp2, Temp3, Temp4; - int i; - - for (i = no_rows; i--;) { - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[4]) * cospi_16_64; - step2_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[4]) * cospi_16_64; - step2_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 8(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "extp %[Temp4], $ac0, 31 \n\t" - - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - /* - temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; - step2_2 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 4(%[input]) \n\t" - "lh %[Temp1], 12(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "extp %[Temp3], $ac0, 31 \n\t" - - /* - step1_1 = step2_1 + step2_2; - step1_2 = step2_1 - step2_2; - */ - "add %[step1_1], %[Temp2], %[Temp3] \n\t" - "sub %[step1_2], %[Temp2], %[Temp3] \n\t" - - /* - temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; - step2_3 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - /* - step1_0 = step2_0 + step2_3; - step1_3 = step2_0 - step2_3; - */ - "add %[step1_0], %[Temp4], %[Temp1] \n\t" - "sub %[step1_3], %[Temp4], %[Temp1] \n\t" - - /* - temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - step1_4 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 2(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp1], 14(%[input]) \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" - "extp %[step1_4], $ac0, 31 \n\t" - - /* - temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1_7 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" - "extp %[step1_7], $ac1, 31 \n\t" - - /* - temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - step1_5 = dct_const_round_shift(temp_1); - */ - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" - "extp %[step1_5], $ac0, 31 \n\t" - - /* - temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1_6 = dct_const_round_shift(temp_2); - */ - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* - temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; - temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; - */ - "sub %[Temp0], %[step1_7], %[step1_6] \n\t" - "sub %[Temp0], %[Temp0], %[step1_4] \n\t" - "add %[Temp0], %[Temp0], %[step1_5] \n\t" - "sub %[Temp1], %[step1_4], %[step1_5] \n\t" - "sub %[Temp1], %[Temp1], %[step1_6] \n\t" - "add %[Temp1], %[Temp1], %[step1_7] \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" - - /* - step1_4 = step1_4 + step1_5; - step1_7 = step1_6 + step1_7; - */ - "add %[step1_4], %[step1_4], %[step1_5] \n\t" - "add %[step1_7], %[step1_7], %[step1_6] \n\t" - - "extp %[step1_5], $ac0, 31 \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - "add %[Temp0], %[step1_0], %[step1_7] \n\t" - "sh %[Temp0], 0(%[output]) \n\t" - "add %[Temp1], %[step1_1], %[step1_6] \n\t" - "sh %[Temp1], 16(%[output]) \n\t" - "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "sh %[Temp0], 32(%[output]) \n\t" - "add %[Temp1], %[step1_3], %[step1_4] \n\t" - "sh %[Temp1], 48(%[output]) \n\t" - - "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "sh %[Temp0], 64(%[output]) \n\t" - "sub %[Temp1], %[step1_2], %[step1_5] \n\t" - "sh %[Temp1], 80(%[output]) \n\t" - "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "sh %[Temp0], 96(%[output]) \n\t" - "sub %[Temp1], %[step1_0], %[step1_7] \n\t" - "sh %[Temp1], 112(%[output]) \n\t" - - : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), - [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), - [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), - [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), - [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_24_64] "r"(cospi_24_64), [output] "r"(output), - [input] "r"(input)); - - input += 8; - output += 1; - } -} - -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; - int Temp0, Temp1, Temp2, Temp3; - int i; - const int const_2_power_13 = 8192; - uint8_t *dest_pix; - uint8_t *cm = aom_ff_cropTbl; - - /* prefetch aom_ff_cropTbl */ - prefetch_load(aom_ff_cropTbl); - prefetch_load(aom_ff_cropTbl + 32); - prefetch_load(aom_ff_cropTbl + 64); - prefetch_load(aom_ff_cropTbl + 96); - prefetch_load(aom_ff_cropTbl + 128); - prefetch_load(aom_ff_cropTbl + 160); - prefetch_load(aom_ff_cropTbl + 192); - prefetch_load(aom_ff_cropTbl + 224); - - for (i = 0; i < 8; ++i) { - dest_pix = (dest + i); - - __asm__ __volatile__( - /* - temp_1 = (input[0] + input[4]) * cospi_16_64; - step2_0 = dct_const_round_shift(temp_1); - - temp_2 = (input[0] - input[4]) * cospi_16_64; - step2_1 = dct_const_round_shift(temp_2); - */ - "lh %[Temp0], 0(%[input]) \n\t" - "lh %[Temp1], 8(%[input]) \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "add %[Temp2], %[Temp0], %[Temp1] \n\t" - "madd $ac0, %[Temp2], %[cospi_16_64] \n\t" - "extp %[step1_6], $ac0, 31 \n\t" - - "sub %[Temp3], %[Temp0], %[Temp1] \n\t" - "madd $ac1, %[Temp3], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "extp %[Temp2], $ac1, 31 \n\t" - - /* - temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64; - step2_2 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 4(%[input]) \n\t" - "lh %[Temp1], 12(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_24_64] \n\t" - "msub $ac0, %[Temp1], %[cospi_8_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "extp %[Temp3], $ac0, 31 \n\t" - - /* - step1_1 = step2_1 + step2_2; - step1_2 = step2_1 - step2_2; - */ - "add %[step1_1], %[Temp2], %[Temp3] \n\t" - "sub %[step1_2], %[Temp2], %[Temp3] \n\t" - - /* - temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64; - step2_3 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_8_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_24_64] \n\t" - "extp %[Temp1], $ac1, 31 \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - - /* - step1_0 = step2_0 + step2_3; - step1_3 = step2_0 - step2_3; - */ - "add %[step1_0], %[step1_6], %[Temp1] \n\t" - "sub %[step1_3], %[step1_6], %[Temp1] \n\t" - - /* - temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - step1_4 = dct_const_round_shift(temp_1); - */ - "lh %[Temp0], 2(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_28_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp1], 14(%[input]) \n\t" - "lh %[Temp0], 2(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_4_64] \n\t" - "extp %[step1_4], $ac0, 31 \n\t" - - /* - temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1_7 = dct_const_round_shift(temp_2); - */ - "madd $ac1, %[Temp0], %[cospi_4_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_28_64] \n\t" - "extp %[step1_7], $ac1, 31 \n\t" - - /* - temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - step1_5 = dct_const_round_shift(temp_1); - */ - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac0, %[Temp0], %[cospi_12_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "msub $ac0, %[Temp1], %[cospi_20_64] \n\t" - "extp %[step1_5], $ac0, 31 \n\t" - - /* - temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1_6 = dct_const_round_shift(temp_2); - */ - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - "lh %[Temp0], 10(%[input]) \n\t" - "madd $ac1, %[Temp0], %[cospi_20_64] \n\t" - "lh %[Temp1], 6(%[input]) \n\t" - "madd $ac1, %[Temp1], %[cospi_12_64] \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* - temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64; - temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64; - */ - "sub %[Temp0], %[step1_7], %[step1_6] \n\t" - "sub %[Temp0], %[Temp0], %[step1_4] \n\t" - "add %[Temp0], %[Temp0], %[step1_5] \n\t" - "sub %[Temp1], %[step1_4], %[step1_5] \n\t" - "sub %[Temp1], %[Temp1], %[step1_6] \n\t" - "add %[Temp1], %[Temp1], %[step1_7] \n\t" - - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" - "mthi $zero, $ac1 \n\t" - - "madd $ac0, %[Temp0], %[cospi_16_64] \n\t" - "madd $ac1, %[Temp1], %[cospi_16_64] \n\t" - - /* - step1_4 = step1_4 + step1_5; - step1_7 = step1_6 + step1_7; - */ - "add %[step1_4], %[step1_4], %[step1_5] \n\t" - "add %[step1_7], %[step1_7], %[step1_6] \n\t" - - "extp %[step1_5], $ac0, 31 \n\t" - "extp %[step1_6], $ac1, 31 \n\t" - - /* add block */ - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "add %[Temp0], %[step1_0], %[step1_7] \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step1_0], %[step1_7] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" - - "lbu %[Temp1], 0(%[dest_pix]) \n\t" - "addi %[Temp0], %[Temp0], 16 \n\t" - "sra %[Temp0], %[Temp0], 5 \n\t" - "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - - : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), - [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), - [step1_4] "=&r"(step1_4), [step1_5] "=&r"(step1_5), - [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), - [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), - [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), - [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), - [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); - - input += 8; - } -} - -void aom_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[8 * 8]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows - idct8_rows_dspr2(input, outptr, 8); - - // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void aom_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - DECLARE_ALIGNED(32, int16_t, out[8 * 8]); - int16_t *outptr = out; - uint32_t pos = 45; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" : : [pos] "r"(pos)); - - // First transform rows - idct8_rows_dspr2(input, outptr, 4); - - outptr += 4; - - __asm__ __volatile__( - "sw $zero, 0(%[outptr]) \n\t" - "sw $zero, 4(%[outptr]) \n\t" - "sw $zero, 16(%[outptr]) \n\t" - "sw $zero, 20(%[outptr]) \n\t" - "sw $zero, 32(%[outptr]) \n\t" - "sw $zero, 36(%[outptr]) \n\t" - "sw $zero, 48(%[outptr]) \n\t" - "sw $zero, 52(%[outptr]) \n\t" - "sw $zero, 64(%[outptr]) \n\t" - "sw $zero, 68(%[outptr]) \n\t" - "sw $zero, 80(%[outptr]) \n\t" - "sw $zero, 84(%[outptr]) \n\t" - "sw $zero, 96(%[outptr]) \n\t" - "sw $zero, 100(%[outptr]) \n\t" - "sw $zero, 112(%[outptr]) \n\t" - "sw $zero, 116(%[outptr]) \n\t" - - : - : [outptr] "r"(outptr)); - - // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); -} - -void aom_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { - uint32_t pos = 45; - int32_t out; - int32_t r; - int32_t a1, absa1; - int32_t t1, t2, vector_a1, vector_1, vector_2; - - /* bit positon for extract from acc */ - __asm__ __volatile__("wrdsp %[pos], 1 \n\t" - - : - : [pos] "r"(pos)); - - out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]); - __asm__ __volatile__( - "addi %[out], %[out], 16 \n\t" - "sra %[a1], %[out], 5 \n\t" - - : [out] "+r"(out), [a1] "=r"(a1) - :); - - if (a1 < 0) { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__( - "abs %[absa1], %[a1] \n\t" - "replv.qb %[vector_a1], %[absa1] \n\t" - - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 8; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), - [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } else { - /* use quad-byte - * input and output memory are four byte aligned */ - __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - - : [vector_a1] "=r"(vector_a1) - : [a1] "r"(a1)); - - for (r = 8; r--;) { - __asm__ __volatile__( - "lw %[t1], 0(%[dest]) \n\t" - "lw %[t2], 4(%[dest]) \n\t" - "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t" - "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" - "sw %[vector_1], 0(%[dest]) \n\t" - "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" - - : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), - [vector_2] "=&r"(vector_2), [dest] "+r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); - } - } -} - -void iadst8_dspr2(const int16_t *input, int16_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - int x0, x1, x2, x3, x4, x5, x6, x7; - - x0 = input[7]; - x1 = input[0]; - x2 = input[5]; - x3 = input[2]; - x4 = input[3]; - x5 = input[4]; - x6 = input[1]; - x7 = input[6]; - - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = 0; - return; - } - - // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - - x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS); - x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS); - x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS); - x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS); - x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS); - x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; - - x0 = s0 + s2; - x1 = s1 + s3; - x2 = s0 - s2; - x3 = s1 - s3; - x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS); - x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS); - - // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); - - x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS); - x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS); - x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS); - x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS); - - output[0] = x0; - output[1] = -x4; - output[2] = x6; - output[3] = -x2; - output[4] = x3; - output[5] = -x7; - output[6] = x5; - output[7] = -x1; -} -#endif // HAVE_DSPR2 diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c index fc0c32ce3..38a10e9b2 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c +++ b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c @@ -404,10 +404,11 @@ void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { } } -void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr, int32_t count) { +static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr, + int32_t count) { DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]); uint8_t early_exit = 0; @@ -639,19 +640,19 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, } } } else { - aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr, - thresh_ptr, count); + mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, + count); } } -void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1); } -void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch, +void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch, const uint8_t *b_limit_ptr, const uint8_t *limit_ptr, const uint8_t *thresh_ptr) { diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c index 883d0523d..8c41278be 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c @@ -11,7 +11,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/mips/common_dspr2.h" #include "aom_dsp/mips/loopfilter_filters_dspr2.h" diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h index 72df09823..3e38ef3fb 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -14,7 +14,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h index 3e6994714..cb599cf2e 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -14,7 +14,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h index 8db3e521f..6db1dac08 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h @@ -14,7 +14,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_mem/aom_mem.h" diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c index a3b5a9eb1..b67ccfe9d 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c @@ -11,7 +11,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/mips/common_dspr2.h" #include "aom_dsp/mips/loopfilter_filters_dspr2.h" diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c index 8d2fd69f7..34733e42e 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c @@ -11,7 +11,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/mips/common_dspr2.h" #include "aom_dsp/mips/loopfilter_filters_dspr2.h" @@ -718,14 +719,13 @@ static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, } } -void aom_lpf_horizontal_edge_8_dspr2(unsigned char *s, int pitch, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { +void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh) { mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1); } -void aom_lpf_horizontal_edge_16_dspr2(unsigned char *s, int pitch, +void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c index 28528869b..3d3f1ec97 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c +++ b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c @@ -11,7 +11,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" #include "aom_dsp/mips/common_dspr2.h" #include "aom_dsp/mips/loopfilter_filters_dspr2.h" diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h index 48fbcfd47..eb919d42b 100644 --- a/third_party/aom/aom_dsp/mips/macros_msa.h +++ b/third_party/aom/aom_dsp/mips/macros_msa.h @@ -14,7 +14,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c index 258eb5c07..58cdd80d9 100644 --- a/third_party/aom/aom_dsp/mips/sad_msa.c +++ b/third_party/aom/aom_dsp/mips/sad_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" #define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \ @@ -160,640 +161,6 @@ static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride, return sad; } -static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v16u8 ref0, ref1, ref2, ref3, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - src_ptr += (4 * src_stride); - INSERT_W4_UB(src0, src1, src2, src3, src); - - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref, ref0, ref1, diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v4u32 sad; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); - ref0_4 = LD_UB(ref + 64); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32((v4i32)sad); -} - -static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - uint32_t src0, src1, src2, src3; - v16u8 ref0, ref1, ref2, ref3, diff; - v16u8 src = { 0 }; - v16u8 ref = { 0 }; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LW4(src_ptr, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, src); - src_ptr += (4 * src_stride); - LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); - ref_ptr += (4 * ref_stride); - - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad0 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1); - SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1); - SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0, ref1, ref00, ref11, ref22, ref33; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 2); ht_cnt--;) { - LD_UB4(src, src_stride, src0, src1, src2, src3); - src += (4 * src_stride); - LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33); - ref += (4 * ref_stride); - PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1, - ref0, ref1); - sad0 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1); - SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1); - PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *ref_ptr, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src, ref0, ref1, ref; - v16u8 diff; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = (height >> 1); ht_cnt--;) { - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - - src = LD_UB(src_ptr); - src_ptr += src_stride; - LD_UB2(ref_ptr, 16, ref0, ref1); - ref_ptr += ref_stride; - - diff = __msa_asub_u_b(src, ref0); - sad0 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1); - diff = __msa_asub_u_b(src, ref); - sad1 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2); - diff = __msa_asub_u_b(src, ref); - sad2 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3); - diff = __msa_asub_u_b(src, ref); - sad3 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4); - diff = __msa_asub_u_b(src, ref); - sad4 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5); - diff = __msa_asub_u_b(src, ref); - sad5 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6); - diff = __msa_asub_u_b(src, ref); - sad6 += __msa_hadd_u_h(diff, diff); - - ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7); - diff = __msa_asub_u_b(src, ref); - sad7 += __msa_hadd_u_h(diff, diff); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1; - v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - const uint8_t *src_dup, *ref_dup; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - v4u32 sad; - - src_dup = src; - ref_dup = ref; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[3] = HADD_SW_S32(sad); - - sad0_0 = (v8u16)__msa_ldi_h(0); - sad0_1 = (v8u16)__msa_ldi_h(0); - sad1_0 = (v8u16)__msa_ldi_h(0); - sad1_1 = (v8u16)__msa_ldi_h(0); - sad2_0 = (v8u16)__msa_ldi_h(0); - sad2_1 = (v8u16)__msa_ldi_h(0); - sad3_0 = (v8u16)__msa_ldi_h(0); - sad3_1 = (v8u16)__msa_ldi_h(0); - - for (ht_cnt = 64; ht_cnt--;) { - LD_UB4(src_dup, 16, src0, src1, src2, src3); - src_dup += src_stride; - LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref_dup += ref_stride; - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[4] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[5] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[6] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[7] = HADD_SW_S32(sad); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1290,76 +657,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, return sad_64width_msa(src, src_stride, ref, ref_stride, height); \ } -#define AOM_SAD_4xHEIGHTx3_MSA(height) \ - void aom_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_8xHEIGHTx3_MSA(height) \ - void aom_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_16xHEIGHTx3_MSA(height) \ - void aom_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_32xHEIGHTx3_MSA(height) \ - void aom_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_64xHEIGHTx3_MSA(height) \ - void aom_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_4xHEIGHTx8_MSA(height) \ - void aom_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_8xHEIGHTx8_MSA(height) \ - void aom_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_16xHEIGHTx8_MSA(height) \ - void aom_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_32xHEIGHTx8_MSA(height) \ - void aom_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define AOM_SAD_64xHEIGHTx8_MSA(height) \ - void aom_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define AOM_SAD_4xHEIGHTx4D_MSA(height) \ void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -1438,92 +735,66 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, /* clang-format off */ // 64x64 AOM_SAD_64xHEIGHT_MSA(64) -AOM_SAD_64xHEIGHTx3_MSA(64) -AOM_SAD_64xHEIGHTx8_MSA(64) AOM_SAD_64xHEIGHTx4D_MSA(64) AOM_AVGSAD_64xHEIGHT_MSA(64) // 64x32 AOM_SAD_64xHEIGHT_MSA(32) -AOM_SAD_64xHEIGHTx3_MSA(32) -AOM_SAD_64xHEIGHTx8_MSA(32) AOM_SAD_64xHEIGHTx4D_MSA(32) AOM_AVGSAD_64xHEIGHT_MSA(32) // 32x64 AOM_SAD_32xHEIGHT_MSA(64) -AOM_SAD_32xHEIGHTx3_MSA(64) -AOM_SAD_32xHEIGHTx8_MSA(64) AOM_SAD_32xHEIGHTx4D_MSA(64) AOM_AVGSAD_32xHEIGHT_MSA(64) // 32x32 AOM_SAD_32xHEIGHT_MSA(32) -AOM_SAD_32xHEIGHTx3_MSA(32) -AOM_SAD_32xHEIGHTx8_MSA(32) AOM_SAD_32xHEIGHTx4D_MSA(32) AOM_AVGSAD_32xHEIGHT_MSA(32) // 32x16 AOM_SAD_32xHEIGHT_MSA(16) -AOM_SAD_32xHEIGHTx3_MSA(16) -AOM_SAD_32xHEIGHTx8_MSA(16) AOM_SAD_32xHEIGHTx4D_MSA(16) AOM_AVGSAD_32xHEIGHT_MSA(16) // 16x32 AOM_SAD_16xHEIGHT_MSA(32) -AOM_SAD_16xHEIGHTx3_MSA(32) -AOM_SAD_16xHEIGHTx8_MSA(32) AOM_SAD_16xHEIGHTx4D_MSA(32) AOM_AVGSAD_16xHEIGHT_MSA(32) // 16x16 AOM_SAD_16xHEIGHT_MSA(16) -AOM_SAD_16xHEIGHTx3_MSA(16) -AOM_SAD_16xHEIGHTx8_MSA(16) AOM_SAD_16xHEIGHTx4D_MSA(16) AOM_AVGSAD_16xHEIGHT_MSA(16) // 16x8 AOM_SAD_16xHEIGHT_MSA(8) -AOM_SAD_16xHEIGHTx3_MSA(8) -AOM_SAD_16xHEIGHTx8_MSA(8) AOM_SAD_16xHEIGHTx4D_MSA(8) AOM_AVGSAD_16xHEIGHT_MSA(8) // 8x16 AOM_SAD_8xHEIGHT_MSA(16) -AOM_SAD_8xHEIGHTx3_MSA(16) -AOM_SAD_8xHEIGHTx8_MSA(16) AOM_SAD_8xHEIGHTx4D_MSA(16) AOM_AVGSAD_8xHEIGHT_MSA(16) // 8x8 AOM_SAD_8xHEIGHT_MSA(8) -AOM_SAD_8xHEIGHTx3_MSA(8) -AOM_SAD_8xHEIGHTx8_MSA(8) AOM_SAD_8xHEIGHTx4D_MSA(8) AOM_AVGSAD_8xHEIGHT_MSA(8) // 8x4 AOM_SAD_8xHEIGHT_MSA(4) -AOM_SAD_8xHEIGHTx3_MSA(4) -AOM_SAD_8xHEIGHTx8_MSA(4) AOM_SAD_8xHEIGHTx4D_MSA(4) AOM_AVGSAD_8xHEIGHT_MSA(4) // 4x8 AOM_SAD_4xHEIGHT_MSA(8) -AOM_SAD_4xHEIGHTx3_MSA(8) -AOM_SAD_4xHEIGHTx8_MSA(8) AOM_SAD_4xHEIGHTx4D_MSA(8) AOM_AVGSAD_4xHEIGHT_MSA(8) // 4x4 AOM_SAD_4xHEIGHT_MSA(4) -AOM_SAD_4xHEIGHTx3_MSA(4) -AOM_SAD_4xHEIGHTx8_MSA(4) AOM_SAD_4xHEIGHTx4D_MSA(4) AOM_AVGSAD_4xHEIGHT_MSA(4) /* clang-format on */ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c index 3eb85107d..a8ee85b6b 100644 --- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c +++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" #include "aom_dsp/mips/macros_msa.h" #include "aom_dsp/variance.h" diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c index 37b89765d..bfed773ac 100644 --- a/third_party/aom/aom_dsp/mips/subtract_msa.c +++ b/third_party/aom/aom_dsp/mips/subtract_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, diff --git a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h b/third_party/aom/aom_dsp/mips/txfm_macros_msa.h deleted file mode 100644 index cba5d4445..000000000 --- a/third_party/aom/aom_dsp/mips/txfm_macros_msa.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ -#define AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ - -#include "aom_dsp/mips/macros_msa.h" - -#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ - { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ - \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ - \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ - ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ - \ - DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ - SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ - out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ - } - -#define DOT_ADD_SUB_SRARI_PCK(in0, in1, in2, in3, in4, in5, in6, in7, dst0, \ - dst1, dst2, dst3) \ - { \ - v4i32 tp0_m, tp1_m, tp2_m, tp3_m, tp4_m; \ - v4i32 tp5_m, tp6_m, tp7_m, tp8_m, tp9_m; \ - \ - DOTP_SH4_SW(in0, in1, in0, in1, in4, in4, in5, in5, tp0_m, tp2_m, tp3_m, \ - tp4_m); \ - DOTP_SH4_SW(in2, in3, in2, in3, in6, in6, in7, in7, tp5_m, tp6_m, tp7_m, \ - tp8_m); \ - BUTTERFLY_4(tp0_m, tp3_m, tp7_m, tp5_m, tp1_m, tp9_m, tp7_m, tp5_m); \ - BUTTERFLY_4(tp2_m, tp4_m, tp8_m, tp6_m, tp3_m, tp0_m, tp4_m, tp2_m); \ - SRARI_W4_SW(tp1_m, tp9_m, tp7_m, tp5_m, DCT_CONST_BITS); \ - SRARI_W4_SW(tp3_m, tp0_m, tp4_m, tp2_m, DCT_CONST_BITS); \ - PCKEV_H4_SH(tp1_m, tp3_m, tp9_m, tp0_m, tp7_m, tp4_m, tp5_m, tp2_m, dst0, \ - dst1, dst2, dst3); \ - } - -#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2) \ - ({ \ - v8i16 dst_m; \ - v4i32 tp0_m, tp1_m; \ - \ - DOTP_SH2_SW(in0, in1, in2, in2, tp1_m, tp0_m); \ - SRARI_W2_SW(tp1_m, tp0_m, DCT_CONST_BITS); \ - dst_m = __msa_pckev_h((v8i16)tp1_m, (v8i16)tp0_m); \ - \ - dst_m; \ - }) - -#define MADD_SHORT(m0, m1, c0, c1, res0, res1) \ - { \ - v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \ - v8i16 madd_s0_m, madd_s1_m; \ - \ - ILVRL_H2_SH(m1, m0, madd_s0_m, madd_s1_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s0_m, madd_s1_m, c0, c0, c1, c1, \ - madd0_m, madd1_m, madd2_m, madd3_m); \ - SRARI_W4_SW(madd0_m, madd1_m, madd2_m, madd3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(madd1_m, madd0_m, madd3_m, madd2_m, res0, res1); \ - } - -#define MADD_BF(inp0, inp1, inp2, inp3, cst0, cst1, cst2, cst3, out0, out1, \ - out2, out3) \ - { \ - v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m, m4_m, m5_m; \ - \ - ILVRL_H2_SH(inp1, inp0, madd_s0_m, madd_s1_m); \ - ILVRL_H2_SH(inp3, inp2, madd_s2_m, madd_s3_m); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst0, cst0, cst2, \ - cst2, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ - SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \ - DOTP_SH4_SW(madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m, cst1, cst1, cst3, \ - cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - BUTTERFLY_4(tmp0_m, tmp1_m, tmp3_m, tmp2_m, m4_m, m5_m, tmp3_m, tmp2_m); \ - SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ - } -#endif // AOM_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c index 745fdfc9c..065c09ac5 100644 --- a/third_party/aom/aom_dsp/mips/variance_msa.c +++ b/third_party/aom/aom_dsp/mips/variance_msa.c @@ -9,7 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/mips/macros_msa.h" #define CALC_MSE_B(src, ref, var) \ diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c new file mode 100644 index 000000000..a1287f74f --- /dev/null +++ b/third_party/aom/aom_dsp/noise_model.c @@ -0,0 +1,1460 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/noise_model.h" +#include "aom_dsp/noise_util.h" +#include "aom_mem/aom_mem.h" +#include "av1/common/common.h" +#include "av1/encoder/mathutils.h" + +#define kLowPolyNumParams 3 + +static const int kMaxLag = 4; + +// Defines a function that can be used to obtain the mean of a block for the +// provided data type (uint8_t, or uint16_t) +#define GET_BLOCK_MEAN(INT_TYPE, suffix) \ + static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \ + int stride, int x_o, int y_o, \ + int block_size) { \ + const int max_h = AOMMIN(h - y_o, block_size); \ + const int max_w = AOMMIN(w - x_o, block_size); \ + double block_mean = 0; \ + for (int y = 0; y < max_h; ++y) { \ + for (int x = 0; x < max_w; ++x) { \ + block_mean += data[(y_o + y) * stride + x_o + x]; \ + } \ + } \ + return block_mean / (max_w * max_h); \ + } + +GET_BLOCK_MEAN(uint8_t, lowbd); +GET_BLOCK_MEAN(uint16_t, highbd); + +static INLINE double get_block_mean(const uint8_t *data, int w, int h, + int stride, int x_o, int y_o, + int block_size, int use_highbd) { + if (use_highbd) + return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o, + block_size); + return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size); +} + +// Defines a function that can be used to obtain the variance of a block +// for the provided data type (uint8_t, or uint16_t) +#define GET_NOISE_VAR(INT_TYPE, suffix) \ + static double get_noise_var_##suffix( \ + const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \ + int h, int x_o, int y_o, int block_size_x, int block_size_y) { \ + const int max_h = AOMMIN(h - y_o, block_size_y); \ + const int max_w = AOMMIN(w - x_o, block_size_x); \ + double noise_var = 0; \ + double noise_mean = 0; \ + for (int y = 0; y < max_h; ++y) { \ + for (int x = 0; x < max_w; ++x) { \ + double noise = (double)data[(y_o + y) * stride + x_o + x] - \ + denoised[(y_o + y) * stride + x_o + x]; \ + noise_mean += noise; \ + noise_var += noise * noise; \ + } \ + } \ + noise_mean /= (max_w * max_h); \ + return noise_var / (max_w * max_h) - noise_mean * noise_mean; \ + } + +GET_NOISE_VAR(uint8_t, lowbd); +GET_NOISE_VAR(uint16_t, highbd); + +static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised, + int w, int h, int stride, int x_o, int y_o, + int block_size_x, int block_size_y, + int use_highbd) { + if (use_highbd) + return get_noise_var_highbd((const uint16_t *)data, + (const uint16_t *)denoised, w, h, stride, x_o, + y_o, block_size_x, block_size_y); + return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o, + block_size_x, block_size_y); +} + +static void equation_system_clear(aom_equation_system_t *eqns) { + const int n = eqns->n; + memset(eqns->A, 0, sizeof(*eqns->A) * n * n); + memset(eqns->x, 0, sizeof(*eqns->x) * n); + memset(eqns->b, 0, sizeof(*eqns->b) * n); +} + +static void equation_system_copy(aom_equation_system_t *dst, + const aom_equation_system_t *src) { + const int n = dst->n; + memcpy(dst->A, src->A, sizeof(*dst->A) * n * n); + memcpy(dst->x, src->x, sizeof(*dst->x) * n); + memcpy(dst->b, src->b, sizeof(*dst->b) * n); +} + +static int equation_system_init(aom_equation_system_t *eqns, int n) { + eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n); + eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n); + eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n); + eqns->n = n; + if (!eqns->A || !eqns->b || !eqns->x) { + fprintf(stderr, "Failed to allocate system of equations of size %d\n", n); + aom_free(eqns->A); + aom_free(eqns->b); + aom_free(eqns->x); + memset(eqns, 0, sizeof(*eqns)); + return 0; + } + equation_system_clear(eqns); + return 1; +} + +static int equation_system_solve(aom_equation_system_t *eqns) { + const int n = eqns->n; + double *b = (double *)aom_malloc(sizeof(*b) * n); + double *A = (double *)aom_malloc(sizeof(*A) * n * n); + int ret = 0; + if (A == NULL || b == NULL) { + fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n); + aom_free(b); + aom_free(A); + return 0; + } + memcpy(A, eqns->A, sizeof(*eqns->A) * n * n); + memcpy(b, eqns->b, sizeof(*eqns->b) * n); + ret = linsolve(n, A, eqns->n, b, eqns->x); + aom_free(b); + aom_free(A); + + if (ret == 0) { + return 0; + } + return 1; +} + +static void equation_system_add(aom_equation_system_t *dest, + aom_equation_system_t *src) { + const int n = dest->n; + int i, j; + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) { + dest->A[i * n + j] += src->A[i * n + j]; + } + dest->b[i] += src->b[i]; + } +} + +static void equation_system_free(aom_equation_system_t *eqns) { + if (!eqns) return; + aom_free(eqns->A); + aom_free(eqns->b); + aom_free(eqns->x); + memset(eqns, 0, sizeof(*eqns)); +} + +static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) { + equation_system_clear(&solver->eqns); + solver->num_equations = 0; + solver->total = 0; +} + +static void noise_strength_solver_add(aom_noise_strength_solver_t *dest, + aom_noise_strength_solver_t *src) { + equation_system_add(&dest->eqns, &src->eqns); + dest->num_equations += src->num_equations; + dest->total += src->total; +} + +// Return the number of coefficients required for the given parameters +static int num_coeffs(const aom_noise_model_params_t params) { + const int n = 2 * params.lag + 1; + switch (params.shape) { + case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1); + case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2; + } + return 0; +} + +static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) { + const int kNumBins = 20; + if (!equation_system_init(&state->eqns, n)) { + fprintf(stderr, "Failed initialization noise state with size %d\n", n); + return 0; + } + state->ar_gain = 1.0; + state->num_observations = 0; + return aom_noise_strength_solver_init(&state->strength_solver, kNumBins, + bit_depth); +} + +static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) { + const double kTolerance = 1e-6; + const int last = eqns->n - 1; + // Set all of the AR coefficients to zero, but try to solve for correlation + // with the luma channel + memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n); + if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) { + eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last]; + } +} + +int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) { + if (!lut) return 0; + lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points)); + if (!lut->points) return 0; + lut->num_points = num_points; + memset(lut->points, 0, sizeof(*lut->points) * num_points); + return 1; +} + +void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) { + if (!lut) return; + aom_free(lut->points); + memset(lut, 0, sizeof(*lut)); +} + +double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, + double x) { + int i = 0; + // Constant extrapolation for x < x_0. + if (x < lut->points[0][0]) return lut->points[0][1]; + for (i = 0; i < lut->num_points - 1; ++i) { + if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) { + const double a = + (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]); + return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a); + } + } + // Constant extrapolation for x > x_{n-1} + return lut->points[lut->num_points - 1][1]; +} + +static double noise_strength_solver_get_bin_index( + const aom_noise_strength_solver_t *solver, double value) { + const double val = + fclamp(value, solver->min_intensity, solver->max_intensity); + const double range = solver->max_intensity - solver->min_intensity; + return (solver->num_bins - 1) * (val - solver->min_intensity) / range; +} + +static double noise_strength_solver_get_value( + const aom_noise_strength_solver_t *solver, double x) { + const double bin = noise_strength_solver_get_bin_index(solver, x); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1]; +} + +void aom_noise_strength_solver_add_measurement( + aom_noise_strength_solver_t *solver, double block_mean, double noise_std) { + const double bin = noise_strength_solver_get_bin_index(solver, block_mean); + const int bin_i0 = (int)floor(bin); + const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1); + const double a = bin - bin_i0; + const int n = solver->num_bins; + solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a); + solver->eqns.A[bin_i1 * n + bin_i1] += a * a; + solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a); + solver->eqns.b[bin_i0] += (1.0 - a) * noise_std; + solver->eqns.b[bin_i1] += a * noise_std; + solver->total += noise_std; + solver->num_equations++; +} + +int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) { + // Add regularization proportional to the number of constraints + const int n = solver->num_bins; + const double kAlpha = 2.0 * (double)(solver->num_equations) / n; + int result = 0; + double mean = 0; + + // Do this in a non-destructive manner so it is not confusing to the caller + double *old_A = solver->eqns.A; + double *A = (double *)aom_malloc(sizeof(*A) * n * n); + if (!A) { + fprintf(stderr, "Unable to allocate copy of A\n"); + return 0; + } + memcpy(A, old_A, sizeof(*A) * n * n); + + for (int i = 0; i < n; ++i) { + const int i_lo = AOMMAX(0, i - 1); + const int i_hi = AOMMIN(n - 1, i + 1); + A[i * n + i_lo] -= kAlpha; + A[i * n + i] += 2 * kAlpha; + A[i * n + i_hi] -= kAlpha; + } + + // Small regularization to give average noise strength + mean = solver->total / solver->num_equations; + for (int i = 0; i < n; ++i) { + A[i * n + i] += 1.0 / 8192.; + solver->eqns.b[i] += mean / 8192.; + } + solver->eqns.A = A; + result = equation_system_solve(&solver->eqns); + solver->eqns.A = old_A; + + aom_free(A); + return result; +} + +int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, + int num_bins, int bit_depth) { + if (!solver) return 0; + memset(solver, 0, sizeof(*solver)); + solver->num_bins = num_bins; + solver->min_intensity = 0; + solver->max_intensity = (1 << bit_depth) - 1; + solver->total = 0; + solver->num_equations = 0; + return equation_system_init(&solver->eqns, num_bins); +} + +void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) { + if (!solver) return; + equation_system_free(&solver->eqns); +} + +double aom_noise_strength_solver_get_center( + const aom_noise_strength_solver_t *solver, int i) { + const double range = solver->max_intensity - solver->min_intensity; + const int n = solver->num_bins; + return ((double)i) / (n - 1) * range + solver->min_intensity; +} + +// Computes the residual if a point were to be removed from the lut. This is +// calculated as the area between the output of the solver and the line segment +// that would be formed between [x_{i - 1}, x_{i + 1}). +static void update_piecewise_linear_residual( + const aom_noise_strength_solver_t *solver, + const aom_noise_strength_lut_t *lut, double *residual, int start, int end) { + const double dx = 255. / solver->num_bins; + for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) { + const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index( + solver, lut->points[i - 1][0]))); + const int upper = AOMMIN(solver->num_bins - 1, + (int)ceil(noise_strength_solver_get_bin_index( + solver, lut->points[i + 1][0]))); + double r = 0; + for (int j = lower; j <= upper; ++j) { + const double x = aom_noise_strength_solver_get_center(solver, j); + if (x < lut->points[i - 1][0]) continue; + if (x >= lut->points[i + 1][0]) continue; + const double y = solver->eqns.x[j]; + const double a = (x - lut->points[i - 1][0]) / + (lut->points[i + 1][0] - lut->points[i - 1][0]); + const double estimate_y = + lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a; + r += fabs(y - estimate_y); + } + residual[i] = r * dx; + } +} + +int aom_noise_strength_solver_fit_piecewise( + const aom_noise_strength_solver_t *solver, int max_output_points, + aom_noise_strength_lut_t *lut) { + // The tolerance is normalized to be give consistent results between + // different bit-depths. + const double kTolerance = solver->max_intensity * 0.00625 / 255.0; + if (!aom_noise_strength_lut_init(lut, solver->num_bins)) { + fprintf(stderr, "Failed to init lut\n"); + return 0; + } + for (int i = 0; i < solver->num_bins; ++i) { + lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i); + lut->points[i][1] = solver->eqns.x[i]; + } + if (max_output_points < 0) { + max_output_points = solver->num_bins; + } + + double *residual = aom_malloc(solver->num_bins * sizeof(*residual)); + memset(residual, 0, sizeof(*residual) * solver->num_bins); + + update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins); + + // Greedily remove points if there are too many or if it doesn't hurt local + // approximation (never remove the end points) + while (lut->num_points > 2) { + int min_index = 1; + for (int j = 1; j < lut->num_points - 1; ++j) { + if (residual[j] < residual[min_index]) { + min_index = j; + } + } + const double dx = + lut->points[min_index + 1][0] - lut->points[min_index - 1][0]; + const double avg_residual = residual[min_index] / dx; + if (lut->num_points <= max_output_points && avg_residual > kTolerance) { + break; + } + + const int num_remaining = lut->num_points - min_index - 1; + memmove(lut->points + min_index, lut->points + min_index + 1, + sizeof(lut->points[0]) * num_remaining); + lut->num_points--; + + update_piecewise_linear_residual(solver, lut, residual, min_index - 1, + min_index + 1); + } + aom_free(residual); + return 1; +} + +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd) { + const int n = block_size * block_size; + aom_equation_system_t eqns; + double *AtA_inv = 0; + double *A = 0; + int x = 0, y = 0, i = 0, j = 0; + if (!equation_system_init(&eqns, kLowPolyNumParams)) { + fprintf(stderr, "Failed to init equation system for block_size=%d\n", + block_size); + return 0; + } + + AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams * + sizeof(*AtA_inv)); + A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A)); + if (AtA_inv == NULL || A == NULL) { + fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n", + block_size); + aom_free(AtA_inv); + aom_free(A); + equation_system_free(&eqns); + return 0; + } + + block_finder->A = A; + block_finder->AtA_inv = AtA_inv; + block_finder->block_size = block_size; + block_finder->normalization = (1 << bit_depth) - 1; + block_finder->use_highbd = use_highbd; + + for (y = 0; y < block_size; ++y) { + const double yd = ((double)y - block_size / 2.) / (block_size / 2.); + for (x = 0; x < block_size; ++x) { + const double xd = ((double)x - block_size / 2.) / (block_size / 2.); + const double coords[3] = { yd, xd, 1 }; + const int row = y * block_size + x; + A[kLowPolyNumParams * row + 0] = yd; + A[kLowPolyNumParams * row + 1] = xd; + A[kLowPolyNumParams * row + 2] = 1; + + for (i = 0; i < kLowPolyNumParams; ++i) { + for (j = 0; j < kLowPolyNumParams; ++j) { + eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j]; + } + } + } + } + + // Lazy inverse using existing equation solver. + for (i = 0; i < kLowPolyNumParams; ++i) { + memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams); + eqns.b[i] = 1; + equation_system_solve(&eqns); + + for (j = 0; j < kLowPolyNumParams; ++j) { + AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j]; + } + } + equation_system_free(&eqns); + return 1; +} + +void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) { + if (!block_finder) return; + aom_free(block_finder->A); + aom_free(block_finder->AtA_inv); + memset(block_finder, 0, sizeof(*block_finder)); +} + +void aom_flat_block_finder_extract_block( + const aom_flat_block_finder_t *block_finder, const uint8_t *const data, + int w, int h, int stride, int offsx, int offsy, double *plane, + double *block) { + const int block_size = block_finder->block_size; + const int n = block_size * block_size; + const double *A = block_finder->A; + const double *AtA_inv = block_finder->AtA_inv; + double plane_coords[kLowPolyNumParams]; + double AtA_inv_b[kLowPolyNumParams]; + int xi, yi, i; + + if (block_finder->use_highbd) { + const uint16_t *const data16 = (const uint16_t *const)data; + for (yi = 0; yi < block_size; ++yi) { + const int y = clamp(offsy + yi, 0, h - 1); + for (xi = 0; xi < block_size; ++xi) { + const int x = clamp(offsx + xi, 0, w - 1); + block[yi * block_size + xi] = + ((double)data16[y * stride + x]) / block_finder->normalization; + } + } + } else { + for (yi = 0; yi < block_size; ++yi) { + const int y = clamp(offsy + yi, 0, h - 1); + for (xi = 0; xi < block_size; ++xi) { + const int x = clamp(offsx + xi, 0, w - 1); + block[yi * block_size + xi] = + ((double)data[y * stride + x]) / block_finder->normalization; + } + } + } + multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams); + multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams, + kLowPolyNumParams, 1); + multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1); + + for (i = 0; i < n; ++i) { + block[i] -= plane[i]; + } +} + +typedef struct { + int index; + float score; +} index_and_score_t; + +static int compare_scores(const void *a, const void *b) { + const float diff = + ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score; + if (diff < 0) + return -1; + else if (diff > 0) + return 1; + return 0; +} + +int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, + const uint8_t *const data, int w, int h, + int stride, uint8_t *flat_blocks) { + // The gradient-based features used in this code are based on: + // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise + // correlation for improved video denoising," 2012 19th, ICIP. + // The thresholds are more lenient to allow for correct grain modeling + // if extreme cases. + const int block_size = block_finder->block_size; + const int n = block_size * block_size; + const double kTraceThreshold = 0.15 / (32 * 32); + const double kRatioThreshold = 1.25; + const double kNormThreshold = 0.08 / (32 * 32); + const double kVarThreshold = 0.005 / (double)n; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int num_flat = 0; + int bx = 0, by = 0; + double *plane = (double *)aom_malloc(n * sizeof(*plane)); + double *block = (double *)aom_malloc(n * sizeof(*block)); + index_and_score_t *scores = (index_and_score_t *)aom_malloc( + num_blocks_w * num_blocks_h * sizeof(*scores)); + if (plane == NULL || block == NULL || scores == NULL) { + fprintf(stderr, "Failed to allocate memory for block of size %d\n", n); + aom_free(plane); + aom_free(block); + aom_free(scores); + return -1; + } + +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "score = ["); +#endif + for (by = 0; by < num_blocks_h; ++by) { + for (bx = 0; bx < num_blocks_w; ++bx) { + // Compute gradient covariance matrix. + double Gxx = 0, Gxy = 0, Gyy = 0; + double var = 0; + double mean = 0; + int xi, yi; + aom_flat_block_finder_extract_block(block_finder, data, w, h, stride, + bx * block_size, by * block_size, + plane, block); + + for (yi = 1; yi < block_size - 1; ++yi) { + for (xi = 1; xi < block_size - 1; ++xi) { + const double gx = (block[yi * block_size + xi + 1] - + block[yi * block_size + xi - 1]) / + 2; + const double gy = (block[yi * block_size + xi + block_size] - + block[yi * block_size + xi - block_size]) / + 2; + Gxx += gx * gx; + Gxy += gx * gy; + Gyy += gy * gy; + + mean += block[yi * block_size + xi]; + var += block[yi * block_size + xi] * block[yi * block_size + xi]; + } + } + mean /= (block_size - 2) * (block_size - 2); + + // Normalize gradients by block_size. + Gxx /= ((block_size - 2) * (block_size - 2)); + Gxy /= ((block_size - 2) * (block_size - 2)); + Gyy /= ((block_size - 2) * (block_size - 2)); + var = var / ((block_size - 2) * (block_size - 2)) - mean * mean; + + { + const double trace = Gxx + Gyy; + const double det = Gxx * Gyy - Gxy * Gxy; + const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.; + const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.; + const double norm = e1; // Spectral norm + const double ratio = (e1 / AOMMAX(e2, 1e-6)); + const int is_flat = (trace < kTraceThreshold) && + (ratio < kRatioThreshold) && + (norm < kNormThreshold) && (var > kVarThreshold); + // The following weights are used to combine the above features to give + // a sigmoid score for flatness. If the input was normalized to [0,100] + // the magnitude of these values would be close to 1 (e.g., weights + // corresponding to variance would be a factor of 10000x smaller). + // The weights are given in the following order: + // [{var}, {ratio}, {trace}, {norm}, offset] + // with one of the most discriminative being simply the variance. + const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 }; + const float score = + (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio + + weights[2] * trace + weights[3] * norm + + weights[4])))); + flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0; + scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0; + scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx; +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm, + is_flat); +#endif + num_flat += is_flat; + } + } +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "\n"); +#endif + } +#ifdef NOISE_MODEL_LOG_SCORE + fprintf(stderr, "];\n"); +#endif + // Find the top-scored blocks (most likely to be flat) and set the flat blocks + // be the union of the thresholded results and the top 10th percentile of the + // scored results. + qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores); + const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100; + const float score_threshold = scores[top_nth_percentile].score; + for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) { + if (scores[i].score >= score_threshold) { + num_flat += flat_blocks[scores[i].index] == 0; + flat_blocks[scores[i].index] |= 1; + } + } + aom_free(block); + aom_free(plane); + aom_free(scores); + return num_flat; +} + +int aom_noise_model_init(aom_noise_model_t *model, + const aom_noise_model_params_t params) { + const int n = num_coeffs(params); + const int lag = params.lag; + const int bit_depth = params.bit_depth; + int x = 0, y = 0, i = 0, c = 0; + + memset(model, 0, sizeof(*model)); + if (params.lag < 1) { + fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag); + return 0; + } + if (params.lag > kMaxLag) { + fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag, + kMaxLag); + return 0; + } + + memcpy(&model->params, ¶ms, sizeof(params)); + for (c = 0; c < 3; ++c) { + if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) { + fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); + aom_noise_model_free(model); + return 0; + } + if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) { + fprintf(stderr, "Failed to allocate noise state for channel %d\n", c); + aom_noise_model_free(model); + return 0; + } + } + model->n = n; + model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n); + + for (y = -lag; y <= 0; ++y) { + const int max_x = y == 0 ? -1 : lag; + for (x = -lag; x <= max_x; ++x) { + switch (params.shape) { + case AOM_NOISE_SHAPE_DIAMOND: + if (abs(x) <= y + lag) { + model->coords[i][0] = x; + model->coords[i][1] = y; + ++i; + } + break; + case AOM_NOISE_SHAPE_SQUARE: + model->coords[i][0] = x; + model->coords[i][1] = y; + ++i; + break; + default: + fprintf(stderr, "Invalid shape\n"); + aom_noise_model_free(model); + return 0; + } + } + } + assert(i == n); + return 1; +} + +void aom_noise_model_free(aom_noise_model_t *model) { + int c = 0; + if (!model) return; + + aom_free(model->coords); + for (c = 0; c < 3; ++c) { + equation_system_free(&model->latest_state[c].eqns); + equation_system_free(&model->combined_state[c].eqns); + + equation_system_free(&model->latest_state[c].strength_solver.eqns); + equation_system_free(&model->combined_state[c].strength_solver.eqns); + } + memset(model, 0, sizeof(*model)); +} + +// Extracts the neighborhood defined by coords around point (x, y) from +// the difference between the data and denoised images. Also extracts the +// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma). +#define EXTRACT_AR_ROW(INT_TYPE, suffix) \ + static double extract_ar_row_##suffix( \ + int(*coords)[2], int num_coords, const INT_TYPE *const data, \ + const INT_TYPE *const denoised, int stride, int sub_log2[2], \ + const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \ + int alt_stride, int x, int y, double *buffer) { \ + for (int i = 0; i < num_coords; ++i) { \ + const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \ + buffer[i] = \ + (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \ + } \ + const double val = \ + (double)data[y * stride + x] - denoised[y * stride + x]; \ + \ + if (alt_data && alt_denoised) { \ + double avg_data = 0, avg_denoised = 0; \ + int num_samples = 0; \ + for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \ + const int y_up = (y << sub_log2[1]) + dy_i; \ + for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \ + const int x_up = (x << sub_log2[0]) + dx_i; \ + avg_data += alt_data[y_up * alt_stride + x_up]; \ + avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \ + num_samples++; \ + } \ + } \ + buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \ + } \ + return val; \ + } + +EXTRACT_AR_ROW(uint8_t, lowbd); +EXTRACT_AR_ROW(uint16_t, highbd); + +static int add_block_observations( + aom_noise_model_t *noise_model, int c, const uint8_t *const data, + const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2], + const uint8_t *const alt_data, const uint8_t *const alt_denoised, + int alt_stride, const uint8_t *const flat_blocks, int block_size, + int num_blocks_w, int num_blocks_h) { + const int lag = noise_model->params.lag; + const int num_coords = noise_model->n; + const double normalization = (1 << noise_model->params.bit_depth) - 1; + double *A = noise_model->latest_state[c].eqns.A; + double *b = noise_model->latest_state[c].eqns.b; + double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1)); + const int n = noise_model->latest_state[c].eqns.n; + + if (!buffer) { + fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1); + return 0; + } + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + int y_start = + (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag; + int x_start = + (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag; + int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + int x_end = AOMMIN( + (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag, + (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1]) + ? (block_size >> sub_log2[0]) + : ((block_size >> sub_log2[0]) - lag)); + for (int y = y_start; y < y_end; ++y) { + for (int x = x_start; x < x_end; ++x) { + const double val = + noise_model->params.use_highbd + ? extract_ar_row_highbd(noise_model->coords, num_coords, + (const uint16_t *const)data, + (const uint16_t *const)denoised, + stride, sub_log2, + (const uint16_t *const)alt_data, + (const uint16_t *const)alt_denoised, + alt_stride, x + x_o, y + y_o, buffer) + : extract_ar_row_lowbd(noise_model->coords, num_coords, data, + denoised, stride, sub_log2, alt_data, + alt_denoised, alt_stride, x + x_o, + y + y_o, buffer); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A[i * n + j] += + (buffer[i] * buffer[j]) / (normalization * normalization); + } + b[i] += (buffer[i] * val) / (normalization * normalization); + } + noise_model->latest_state[c].num_observations++; + } + } + } + } + aom_free(buffer); + return 1; +} + +static void add_noise_std_observations( + aom_noise_model_t *noise_model, int c, const double *coeffs, + const uint8_t *const data, const uint8_t *const denoised, int w, int h, + int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride, + const uint8_t *const flat_blocks, int block_size, int num_blocks_w, + int num_blocks_h) { + const int num_coords = noise_model->n; + aom_noise_strength_solver_t *noise_strength_solver = + &noise_model->latest_state[c].strength_solver; + + const aom_noise_strength_solver_t *noise_strength_luma = + &noise_model->latest_state[0].strength_solver; + const double luma_gain = noise_model->latest_state[0].ar_gain; + const double noise_gain = noise_model->latest_state[c].ar_gain; + for (int by = 0; by < num_blocks_h; ++by) { + const int y_o = by * (block_size >> sub_log2[1]); + for (int bx = 0; bx < num_blocks_w; ++bx) { + const int x_o = bx * (block_size >> sub_log2[0]); + if (!flat_blocks[by * num_blocks_w + bx]) { + continue; + } + const int num_samples_h = + AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]), + block_size >> sub_log2[1]); + const int num_samples_w = + AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]), + (block_size >> sub_log2[0])); + // Make sure that we have a reasonable amount of samples to consider the + // block + if (num_samples_w * num_samples_h > block_size) { + const double block_mean = get_block_mean( + alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride, + x_o << sub_log2[0], y_o << sub_log2[1], block_size, + noise_model->params.use_highbd); + const double noise_var = get_noise_var( + data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o, + y_o, block_size >> sub_log2[0], block_size >> sub_log2[1], + noise_model->params.use_highbd); + // We want to remove the part of the noise that came from being + // correlated with luma. Note that the noise solver for luma must + // have already been run. + const double luma_strength = + c > 0 ? luma_gain * noise_strength_solver_get_value( + noise_strength_luma, block_mean) + : 0; + const double corr = c > 0 ? coeffs[num_coords] : 0; + // Chroma noise: + // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2) + // The uncorrelated component: + // uncorr_var = noise_var - (corr * luma_strength)^2 + // But don't allow fully correlated noise (hence the max), since the + // synthesis cannot model it. + const double uncorr_std = sqrt( + AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2))); + // After we've removed correlation with luma, undo the gain that will + // come from running the IIR filter. + const double adjusted_strength = uncorr_std / noise_gain; + aom_noise_strength_solver_add_measurement( + noise_strength_solver, block_mean, adjusted_strength); + } + } + } +} + +// Return true if the noise estimate appears to be different from the combined +// (multi-frame) estimate. The difference is measured by checking whether the +// AR coefficients have diverged (using a threshold on normalized cross +// correlation), or whether the noise strength has changed. +static int is_noise_model_different(aom_noise_model_t *const noise_model) { + // These thresholds are kind of arbitrary and will likely need further tuning + // (or exported as parameters). The threshold on noise strength is a weighted + // difference between the noise strength histograms + const double kCoeffThreshold = 0.9; + const double kStrengthThreshold = + 0.005 * (1 << (noise_model->params.bit_depth - 8)); + for (int c = 0; c < 1; ++c) { + const double corr = + aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x, + noise_model->combined_state[c].eqns.x, + noise_model->combined_state[c].eqns.n); + if (corr < kCoeffThreshold) return 1; + + const double dx = + 1.0 / noise_model->latest_state[c].strength_solver.num_bins; + + const aom_equation_system_t *latest_eqns = + &noise_model->latest_state[c].strength_solver.eqns; + const aom_equation_system_t *combined_eqns = + &noise_model->combined_state[c].strength_solver.eqns; + double diff = 0; + double total_weight = 0; + for (int j = 0; j < latest_eqns->n; ++j) { + double weight = 0; + for (int i = 0; i < latest_eqns->n; ++i) { + weight += latest_eqns->A[i * latest_eqns->n + j]; + } + weight = sqrt(weight); + diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]); + total_weight += weight; + } + if (diff * dx / total_weight > kStrengthThreshold) return 1; + } + return 0; +} + +static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) { + const int ret = equation_system_solve(&state->eqns); + state->ar_gain = 1.0; + if (!ret) return ret; + + // Update the AR gain from the equation system as it will be used to fit + // the noise strength as a function of intensity. In the Yule-Walker + // equations, the diagonal should be the variance of the correlated noise. + // In the case of the least squares estimate, there will be some variability + // in the diagonal. So use the mean of the diagonal as the estimate of + // overall variance (this works for least squares or Yule-Walker formulation). + double var = 0; + const int n = state->eqns.n; + for (int i = 0; i < (state->eqns.n - is_chroma); ++i) { + var += state->eqns.A[i * n + i] / state->num_observations; + } + var /= (n - is_chroma); + + // Keep track of E(Y^2) = + E(X^2) + // In the case that we are using chroma and have an estimate of correlation + // with luma we adjust that estimate slightly to remove the correlated bits by + // subtracting out the last column of a scaled by our correlation estimate + // from b. E(y^2) = + double sum_covar = 0; + for (int i = 0; i < state->eqns.n - is_chroma; ++i) { + double bi = state->eqns.b[i]; + if (is_chroma) { + bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1]; + } + sum_covar += (bi * state->eqns.x[i]) / state->num_observations; + } + // Now, get an estimate of the variance of uncorrelated noise signal and use + // it to determine the gain of the AR filter. + const double noise_var = AOMMAX(var - sum_covar, 1e-6); + state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6))); + return ret; +} + +aom_noise_status_t aom_noise_model_update( + aom_noise_model_t *const noise_model, const uint8_t *const data[3], + const uint8_t *const denoised[3], int w, int h, int stride[3], + int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) { + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + int y_model_different = 0; + int num_blocks = 0; + int i = 0, channel = 0; + + if (block_size <= 1) { + fprintf(stderr, "block_size = %d must be > 1\n", block_size); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + if (block_size < noise_model->params.lag * 2 + 1) { + fprintf(stderr, "block_size = %d must be >= %d\n", block_size, + noise_model->params.lag * 2 + 1); + return AOM_NOISE_STATUS_INVALID_ARGUMENT; + } + + // Clear the latest equation system + for (i = 0; i < 3; ++i) { + equation_system_clear(&noise_model->latest_state[i].eqns); + noise_model->latest_state[i].num_observations = 0; + noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver); + } + + // Check that we have enough flat blocks + for (i = 0; i < num_blocks_h * num_blocks_w; ++i) { + if (flat_blocks[i]) { + num_blocks++; + } + } + + if (num_blocks <= 1) { + fprintf(stderr, "Not enough flat blocks to update noise estimate\n"); + return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS; + } + + for (channel = 0; channel < 3; ++channel) { + int no_subsampling[2] = { 0, 0 }; + const uint8_t *alt_data = channel > 0 ? data[0] : 0; + const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0; + int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling; + const int is_chroma = channel != 0; + if (!data[channel] || !denoised[channel]) break; + if (!add_block_observations(noise_model, channel, data[channel], + denoised[channel], w, h, stride[channel], sub, + alt_data, alt_denoised, stride[0], flat_blocks, + block_size, num_blocks_w, num_blocks_h)) { + fprintf(stderr, "Adding block observation failed\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + if (!ar_equation_system_solve(&noise_model->latest_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->latest_state[channel].eqns); + } else { + fprintf(stderr, "Solving latest noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + add_noise_std_observations( + noise_model, channel, noise_model->latest_state[channel].eqns.x, + data[channel], denoised[channel], w, h, stride[channel], sub, alt_data, + stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h); + + if (!aom_noise_strength_solver_solve( + &noise_model->latest_state[channel].strength_solver)) { + fprintf(stderr, "Solving latest noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + + // Check noise characteristics and return if error. + if (channel == 0 && + noise_model->combined_state[channel].strength_solver.num_equations > + 0 && + is_noise_model_different(noise_model)) { + y_model_different = 1; + } + + // Don't update the combined stats if the y model is different. + if (y_model_different) continue; + + noise_model->combined_state[channel].num_observations += + noise_model->latest_state[channel].num_observations; + equation_system_add(&noise_model->combined_state[channel].eqns, + &noise_model->latest_state[channel].eqns); + if (!ar_equation_system_solve(&noise_model->combined_state[channel], + is_chroma)) { + if (is_chroma) { + set_chroma_coefficient_fallback_soln( + &noise_model->combined_state[channel].eqns); + } else { + fprintf(stderr, "Solving combined noise equation system failed %d!\n", + channel); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + noise_strength_solver_add( + &noise_model->combined_state[channel].strength_solver, + &noise_model->latest_state[channel].strength_solver); + + if (!aom_noise_strength_solver_solve( + &noise_model->combined_state[channel].strength_solver)) { + fprintf(stderr, "Solving combined noise strength failed!\n"); + return AOM_NOISE_STATUS_INTERNAL_ERROR; + } + } + + return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE + : AOM_NOISE_STATUS_OK; +} + +void aom_noise_model_save_latest(aom_noise_model_t *noise_model) { + for (int c = 0; c < 3; c++) { + equation_system_copy(&noise_model->combined_state[c].eqns, + &noise_model->latest_state[c].eqns); + equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns, + &noise_model->latest_state[c].strength_solver.eqns); + noise_model->combined_state[c].strength_solver.num_equations = + noise_model->latest_state[c].strength_solver.num_equations; + noise_model->combined_state[c].num_observations = + noise_model->latest_state[c].num_observations; + noise_model->combined_state[c].ar_gain = + noise_model->latest_state[c].ar_gain; + } +} + +int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, + aom_film_grain_t *film_grain) { + if (noise_model->params.lag > 3) { + fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); + return 0; + } + memset(film_grain, 0, sizeof(*film_grain)); + + film_grain->apply_grain = 1; + film_grain->update_parameters = 1; + + film_grain->ar_coeff_lag = noise_model->params.lag; + + // Convert the scaling functions to 8 bit values + aom_noise_strength_lut_t scaling_points[3]; + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0); + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1); + aom_noise_strength_solver_fit_piecewise( + &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2); + + // Both the domain and the range of the scaling functions in the film_grain + // are normalized to 8-bit (e.g., they are implicitly scaled during grain + // synthesis). + const double strength_divisor = 1 << (noise_model->params.bit_depth - 8); + double max_scaling_value = 1e-4; + for (int c = 0; c < 3; ++c) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + scaling_points[c].points[i][0] = + AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor); + scaling_points[c].points[i][1] = + AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor); + max_scaling_value = + AOMMAX(scaling_points[c].points[i][1], max_scaling_value); + } + } + + // Scaling_shift values are in the range [8,11] + const int max_scaling_value_log2 = + clamp((int)floor(log2(max_scaling_value) + 1), 2, 5); + film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2); + + const double scale_factor = 1 << (8 - max_scaling_value_log2); + film_grain->num_y_points = scaling_points[0].num_points; + film_grain->num_cb_points = scaling_points[1].num_points; + film_grain->num_cr_points = scaling_points[2].num_points; + + int(*film_grain_scaling[3])[2] = { + film_grain->scaling_points_y, + film_grain->scaling_points_cb, + film_grain->scaling_points_cr, + }; + for (int c = 0; c < 3; c++) { + for (int i = 0; i < scaling_points[c].num_points; ++i) { + film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5); + film_grain_scaling[c][i][1] = clamp( + (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255); + } + } + aom_noise_strength_lut_free(scaling_points + 0); + aom_noise_strength_lut_free(scaling_points + 1); + aom_noise_strength_lut_free(scaling_points + 2); + + // Convert the ar_coeffs into 8-bit values + const int n_coeff = noise_model->combined_state[0].eqns.n; + double max_coeff = 1e-4, min_coeff = -1e-4; + double y_corr[2] = { 0, 0 }; + double avg_luma_strength = 0; + for (int c = 0; c < 3; c++) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + max_coeff = AOMMAX(max_coeff, eqns->x[i]); + min_coeff = AOMMIN(min_coeff, eqns->x[i]); + } + // Since the correlation between luma/chroma was computed in an already + // scaled space, we adjust it in the un-scaled space. + aom_noise_strength_solver_t *solver = + &noise_model->combined_state[c].strength_solver; + // Compute a weighted average of the strength for the channel. + double average_strength = 0, total_weight = 0; + for (int i = 0; i < solver->eqns.n; ++i) { + double w = 0; + for (int j = 0; j < solver->eqns.n; ++j) { + w += solver->eqns.A[i * solver->eqns.n + j]; + } + w = sqrt(w); + average_strength += solver->eqns.x[i] * w; + total_weight += w; + } + if (total_weight == 0) + average_strength = 1; + else + average_strength /= total_weight; + if (c == 0) { + avg_luma_strength = average_strength; + } else { + y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength; + max_coeff = AOMMAX(max_coeff, y_corr[c - 1]); + min_coeff = AOMMIN(min_coeff, y_corr[c - 1]); + } + } + // Shift value: AR coeffs range (values 6-9) + // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25) + film_grain->ar_coeff_shift = + clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))), + 6, 9); + double scale_ar_coeff = 1 << film_grain->ar_coeff_shift; + int *ar_coeffs[3] = { + film_grain->ar_coeffs_y, + film_grain->ar_coeffs_cb, + film_grain->ar_coeffs_cr, + }; + for (int c = 0; c < 3; ++c) { + aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns; + for (int i = 0; i < n_coeff; ++i) { + ar_coeffs[c][i] = + clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127); + } + if (c > 0) { + ar_coeffs[c][n_coeff] = + clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127); + } + } + + // At the moment, the noise modeling code assumes that the chroma scaling + // functions are a function of luma. + film_grain->cb_mult = 128; // 8 bits + film_grain->cb_luma_mult = 192; // 8 bits + film_grain->cb_offset = 256; // 9 bits + + film_grain->cr_mult = 128; // 8 bits + film_grain->cr_luma_mult = 192; // 8 bits + film_grain->cr_offset = 256; // 9 bits + + film_grain->chroma_scaling_from_luma = 0; + film_grain->grain_scale_shift = 0; + film_grain->overlap_flag = 1; + return 1; +} + +static void pointwise_multiply(const float *a, float *b, int n) { + for (int i = 0; i < n; ++i) { + b[i] *= a[i]; + } +} + +static float *get_half_cos_window(int block_size) { + float *window_function = + (float *)aom_malloc(block_size * block_size * sizeof(*window_function)); + for (int y = 0; y < block_size; ++y) { + const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2); + for (int x = 0; x < block_size; ++x) { + const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2); + window_function[y * block_size + x] = (float)(cos_yd * cos_xd); + } + } + return window_function; +} + +#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \ + static void dither_and_quantize_##suffix( \ + float *result, int result_stride, INT_TYPE *denoised, int w, int h, \ + int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \ + float block_normalization) { \ + for (int y = 0; y < (h >> chroma_sub_h); ++y) { \ + for (int x = 0; x < (w >> chroma_sub_w); ++x) { \ + const int result_idx = \ + (y + (block_size >> chroma_sub_h)) * result_stride + x + \ + (block_size >> chroma_sub_w); \ + INT_TYPE new_val = (INT_TYPE)AOMMIN( \ + AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \ + block_normalization); \ + const float err = \ + -(((float)new_val) / block_normalization - result[result_idx]); \ + denoised[y * stride + x] = new_val; \ + if (x + 1 < (w >> chroma_sub_w)) { \ + result[result_idx + 1] += err * 7.0f / 16.0f; \ + } \ + if (y + 1 < (h >> chroma_sub_h)) { \ + if (x > 0) { \ + result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \ + } \ + result[result_idx + result_stride] += err * 5.0f / 16.0f; \ + if (x + 1 < (w >> chroma_sub_w)) { \ + result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \ + } \ + } \ + } \ + } \ + } + +DITHER_AND_QUANTIZE(uint8_t, lowbd); +DITHER_AND_QUANTIZE(uint16_t, highbd); + +int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], + int w, int h, int stride[3], int chroma_sub[2], + float *noise_psd[3], int block_size, int bit_depth, + int use_highbd) { + float *plane = NULL, *block = NULL, *window_full = NULL, + *window_chroma = NULL; + double *block_d = NULL, *plane_d = NULL; + struct aom_noise_tx_t *tx_full = NULL; + struct aom_noise_tx_t *tx_chroma = NULL; + const int num_blocks_w = (w + block_size - 1) / block_size; + const int num_blocks_h = (h + block_size - 1) / block_size; + const int result_stride = (num_blocks_w + 2) * block_size; + const int result_height = (num_blocks_h + 2) * block_size; + float *result = NULL; + int init_success = 1; + aom_flat_block_finder_t block_finder_full; + aom_flat_block_finder_t block_finder_chroma; + const float kBlockNormalization = (float)((1 << bit_depth) - 1); + if (chroma_sub[0] != chroma_sub[1]) { + fprintf(stderr, + "aom_wiener_denoise_2d doesn't handle different chroma " + "subsampling"); + return 0; + } + init_success &= aom_flat_block_finder_init(&block_finder_full, block_size, + bit_depth, use_highbd); + result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride * + sizeof(*result)); + plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane)); + block = + (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block)); + block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d)); + plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d)); + window_full = get_half_cos_window(block_size); + tx_full = aom_noise_tx_malloc(block_size); + + if (chroma_sub[0] != 0) { + init_success &= aom_flat_block_finder_init(&block_finder_chroma, + block_size >> chroma_sub[0], + bit_depth, use_highbd); + window_chroma = get_half_cos_window(block_size >> chroma_sub[0]); + tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]); + } else { + window_chroma = window_full; + tx_chroma = tx_full; + } + + init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) && + (plane_d != NULL) && (block != NULL) && (block_d != NULL) && + (window_full != NULL) && (window_chroma != NULL) && + (result != NULL); + for (int c = init_success ? 0 : 3; c < 3; ++c) { + float *window_function = c == 0 ? window_full : window_chroma; + aom_flat_block_finder_t *block_finder = &block_finder_full; + const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0; + const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0; + struct aom_noise_tx_t *tx = + (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full; + if (!data[c] || !denoised[c]) continue; + if (c > 0 && chroma_sub[0] != 0) { + block_finder = &block_finder_chroma; + } + memset(result, 0, sizeof(*result) * result_stride * result_height); + // Do overlapped block processing (half overlapped). The block rows can + // easily be done in parallel + for (int offsy = 0; offsy < (block_size >> chroma_sub_h); + offsy += (block_size >> chroma_sub_h) / 2) { + for (int offsx = 0; offsx < (block_size >> chroma_sub_w); + offsx += (block_size >> chroma_sub_w) / 2) { + // Pad the boundary when processing each block-set. + for (int by = -1; by < num_blocks_h; ++by) { + for (int bx = -1; bx < num_blocks_w; ++bx) { + const int pixels_per_block = + (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h); + aom_flat_block_finder_extract_block( + block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h, + stride[c], bx * (block_size >> chroma_sub_w) + offsx, + by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d); + for (int j = 0; j < pixels_per_block; ++j) { + block[j] = (float)block_d[j]; + plane[j] = (float)plane_d[j]; + } + pointwise_multiply(window_function, block, pixels_per_block); + aom_noise_tx_forward(tx, block); + aom_noise_tx_filter(tx, noise_psd[c]); + aom_noise_tx_inverse(tx, block); + + // Apply window function to the plane approximation (we will apply + // it to the sum of plane + block when composing the results). + pointwise_multiply(window_function, plane, pixels_per_block); + + for (int y = 0; y < (block_size >> chroma_sub_h); ++y) { + const int y_result = + y + (by + 1) * (block_size >> chroma_sub_h) + offsy; + for (int x = 0; x < (block_size >> chroma_sub_w); ++x) { + const int x_result = + x + (bx + 1) * (block_size >> chroma_sub_w) + offsx; + result[y_result * result_stride + x_result] += + (block[y * (block_size >> chroma_sub_w) + x] + + plane[y * (block_size >> chroma_sub_w) + x]) * + window_function[y * (block_size >> chroma_sub_w) + x]; + } + } + } + } + } + } + if (use_highbd) { + dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c], + w, h, stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } else { + dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h, + stride[c], chroma_sub_w, chroma_sub_h, + block_size, kBlockNormalization); + } + } + aom_free(result); + aom_free(plane); + aom_free(block); + aom_free(plane_d); + aom_free(block_d); + aom_free(window_full); + + aom_noise_tx_free(tx_full); + + aom_flat_block_finder_free(&block_finder_full); + if (chroma_sub[0] != 0) { + aom_flat_block_finder_free(&block_finder_chroma); + aom_free(window_chroma); + aom_noise_tx_free(tx_chroma); + } + return init_success; +} diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h new file mode 100644 index 000000000..dabeacc14 --- /dev/null +++ b/third_party/aom/aom_dsp/noise_model.h @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_NOISE_MODEL_H_ +#define AOM_DSP_NOISE_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +#include +#include "aom_dsp/grain_synthesis.h" + +/*!\brief Wrapper of data required to represent linear system of eqns and soln. + */ +typedef struct { + double *A; + double *b; + double *x; + int n; +} aom_equation_system_t; + +/*!\brief Representation of a piecewise linear curve + * + * Holds n points as (x, y) pairs, that store the curve. + */ +typedef struct { + double (*points)[2]; + int num_points; +} aom_noise_strength_lut_t; + +/*!\brief Init the noise strength lut with the given number of points*/ +int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points); + +/*!\brief Frees the noise strength lut. */ +void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut); + +/*!\brief Evaluate the lut at the point x. + * + * \param[in] lut The lut data. + * \param[in] x The coordinate to evaluate the lut. + */ +double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut, + double x); + +/*!\brief Helper struct to model noise strength as a function of intensity. + * + * Internally, this structure holds a representation of a linear system + * of equations that models noise strength (standard deviation) as a + * function of intensity. The mapping is initially stored using a + * piecewise representation with evenly spaced bins that cover the entire + * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a + * constraint of the form: + * y_{i} (1 - a) + y_{i+1} a = y + * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and + * a = x/(x_{i+1} - x{i}). The equation system holds the corresponding + * normal equations. + * + * As there may be missing data, the solution is regularized to get a + * complete set of values for the bins. A reduced representation after + * solving can be obtained by getting the corresponding noise_strength_lut_t. + */ +typedef struct { + aom_equation_system_t eqns; + double min_intensity; + double max_intensity; + int num_bins; + int num_equations; + double total; +} aom_noise_strength_solver_t; + +/*!\brief Initializes the noise solver with the given number of bins. + * + * Returns 0 if initialization fails. + * + * \param[in] solver The noise solver to be initialized. + * \param[in] num_bins Number of bins to use in the internal representation. + * \param[in] bit_depth The bit depth used to derive {min,max}_intensity. + */ +int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver, + int num_bins, int bit_depth); +void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver); + +/*!\brief Gets the x coordinate of bin i. + * + * \param[in] i The bin whose coordinate to query. + */ +double aom_noise_strength_solver_get_center( + const aom_noise_strength_solver_t *solver, int i); + +/*!\brief Add an observation of the block mean intensity to its noise strength. + * + * \param[in] block_mean The average block intensity, + * \param[in] noise_std The observed noise strength. + */ +void aom_noise_strength_solver_add_measurement( + aom_noise_strength_solver_t *solver, double block_mean, double noise_std); + +/*!\brief Solves the current set of equations for the noise strength. */ +int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver); + +/*!\brief Fits a reduced piecewise linear lut to the internal solution + * + * \param[in] max_num_points The maximum number of output points + * \param[out] lut The output piecewise linear lut. + */ +int aom_noise_strength_solver_fit_piecewise( + const aom_noise_strength_solver_t *solver, int max_num_points, + aom_noise_strength_lut_t *lut); + +/*!\brief Helper for holding precomputed data for finding flat blocks. + * + * Internally a block is modeled with a low-order polynomial model. A + * planar model would be a bunch of equations like: + * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i + * for each point in the block. The system matrix A with row i as [y_i x_i 1] + * is maintained as is the inverse, inv(A'*A), so that the plane parameters + * can be fit for each block. + */ +typedef struct { + double *AtA_inv; + double *A; + int num_params; // The number of parameters used for internal low-order model + int block_size; // The block size the finder was initialized with + double normalization; // Normalization factor (1 / (2^(bit_depth) - 1)) + int use_highbd; // Whether input data should be interpreted as uint16 +} aom_flat_block_finder_t; + +/*!\brief Init the block_finder with the given block size, bit_depth */ +int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder, + int block_size, int bit_depth, int use_highbd); +void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder); + +/*!\brief Helper to extract a block and low order "planar" model. */ +void aom_flat_block_finder_extract_block( + const aom_flat_block_finder_t *block_finder, const uint8_t *const data, + int w, int h, int stride, int offsx, int offsy, double *plane, + double *block); + +/*!\brief Runs the flat block finder on the input data. + * + * Find flat blocks in the input image data. Returns a map of + * flat_blocks, where the value of flat_blocks map will be non-zero + * when a block is determined to be flat. A higher value indicates a bigger + * confidence in the decision. + */ +int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, + const uint8_t *const data, int w, int h, + int stride, uint8_t *flat_blocks); + +// The noise shape indicates the allowed coefficients in the AR model. +typedef enum { + AOM_NOISE_SHAPE_DIAMOND = 0, + AOM_NOISE_SHAPE_SQUARE = 1 +} aom_noise_shape; + +// The parameters of the noise model include the shape type, lag, the +// bit depth of the input images provided, and whether the input images +// will be using uint16 (or uint8) representation. +typedef struct { + aom_noise_shape shape; + int lag; + int bit_depth; + int use_highbd; +} aom_noise_model_params_t; + +/*!\brief State of a noise model estimate for a single channel. + * + * This contains a system of equations that can be used to solve + * for the auto-regressive coefficients as well as a noise strength + * solver that can be used to model noise strength as a function of + * intensity. + */ +typedef struct { + aom_equation_system_t eqns; + aom_noise_strength_solver_t strength_solver; + int num_observations; // The number of observations in the eqn system + double ar_gain; // The gain of the current AR filter +} aom_noise_state_t; + +/*!\brief Complete model of noise for a planar video + * + * This includes a noise model for the latest frame and an aggregated + * estimate over all previous frames that had similar parameters. + */ +typedef struct { + aom_noise_model_params_t params; + aom_noise_state_t combined_state[3]; // Combined state per channel + aom_noise_state_t latest_state[3]; // Latest state per channel + int (*coords)[2]; // Offsets (x,y) of the coefficient samples + int n; // Number of parameters (size of coords) + int bit_depth; +} aom_noise_model_t; + +/*!\brief Result of a noise model update. */ +typedef enum { + AOM_NOISE_STATUS_OK = 0, + AOM_NOISE_STATUS_INVALID_ARGUMENT, + AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS, + AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, + AOM_NOISE_STATUS_INTERNAL_ERROR, +} aom_noise_status_t; + +/*!\brief Initializes a noise model with the given parameters. + * + * Returns 0 on failure. + */ +int aom_noise_model_init(aom_noise_model_t *model, + const aom_noise_model_params_t params); +void aom_noise_model_free(aom_noise_model_t *model); + +/*!\brief Updates the noise model with a new frame observation. + * + * Updates the noise model with measurements from the given input frame and a + * denoised variant of it. Noise is sampled from flat blocks using the flat + * block map. + * + * Returns a noise_status indicating if the update was successful. If the + * Update was successful, the combined_state is updated with measurements from + * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise + * state will be updated with measurements from the provided frame. + * + * \param[in,out] noise_model The noise model to be updated + * \param[in] data Raw frame data + * \param[in] denoised Denoised frame data. + * \param[in] w Frame width + * \param[in] h Frame height + * \param[in] strides Stride of the planes + * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. + * \param[in] flat_blocks A map to blocks that have been determined flat + * \param[in] block_size The size of blocks. + */ +aom_noise_status_t aom_noise_model_update( + aom_noise_model_t *const noise_model, const uint8_t *const data[3], + const uint8_t *const denoised[3], int w, int h, int strides[3], + int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size); + +/*\brief Save the "latest" estimate into the "combined" estimate. + * + * This is meant to be called when the noise modeling detected a change + * in parameters (or for example, if a user wanted to reset estimation at + * a shot boundary). + */ +void aom_noise_model_save_latest(aom_noise_model_t *noise_model); + +/*!\brief Converts the noise_model parameters to the corresponding + * grain_parameters. + * + * The noise structs in this file are suitable for estimation (e.g., using + * floats), but the grain parameters in the bitstream are quantized. This + * function does the conversion by selecting the correct quantization levels. + */ +int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, + aom_film_grain_t *film_grain); + +/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd. + * + * \param[in] data Raw frame data + * \param[out] denoised Denoised frame data + * \param[in] w Frame width + * \param[in] h Frame height + * \param[in] stride Stride of the planes + * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0. + * \param[in] noise_psd The power spectral density of the noise + * \param[in] block_size The size of blocks + * \param[in] bit_depth Bit depth of the image + * \param[in] use_highbd If true, uint8 pointers are interpreted as + * uint16 and stride is measured in uint16. + * This must be true when bit_depth >= 10. + */ +int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], + int w, int h, int stride[3], int chroma_sub_log2[2], + float *noise_psd[3], int block_size, int bit_depth, + int use_highbd); +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus +#endif // AOM_DSP_NOISE_MODEL_H_ diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c new file mode 100644 index 000000000..87e8e9fec --- /dev/null +++ b/third_party/aom/aom_dsp/noise_util.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include +#include +#include + +#include "aom_dsp/noise_util.h" +#include "aom_dsp/fft_common.h" +#include "aom_mem/aom_mem.h" +#include "config/aom_dsp_rtcd.h" + +float aom_noise_psd_get_default_value(int block_size, float factor) { + return (factor * factor / 10000) * block_size * block_size / 8; +} + +// Internal representation of noise transform. It keeps track of the +// transformed data and a temporary working buffer to use during the +// transform. +struct aom_noise_tx_t { + float *tx_block; + float *temp; + int block_size; + void (*fft)(const float *, float *, float *); + void (*ifft)(const float *, float *, float *); +}; + +struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) { + struct aom_noise_tx_t *noise_tx = + (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t)); + if (!noise_tx) return NULL; + memset(noise_tx, 0, sizeof(*noise_tx)); + switch (block_size) { + case 2: + noise_tx->fft = aom_fft2x2_float; + noise_tx->ifft = aom_ifft2x2_float; + break; + case 4: + noise_tx->fft = aom_fft4x4_float; + noise_tx->ifft = aom_ifft4x4_float; + break; + case 8: + noise_tx->fft = aom_fft8x8_float; + noise_tx->ifft = aom_ifft8x8_float; + break; + case 16: + noise_tx->fft = aom_fft16x16_float; + noise_tx->ifft = aom_ifft16x16_float; + break; + case 32: + noise_tx->fft = aom_fft32x32_float; + noise_tx->ifft = aom_ifft32x32_float; + break; + default: + aom_free(noise_tx); + fprintf(stderr, "Unsupported block size %d\n", block_size); + return NULL; + } + noise_tx->block_size = block_size; + noise_tx->tx_block = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + noise_tx->temp = (float *)aom_memalign( + 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size); + if (!noise_tx->tx_block || !noise_tx->temp) { + aom_noise_tx_free(noise_tx); + return NULL; + } + // Clear the buffers up front. Some outputs of the forward transform are + // real only (the imaginary component will never be touched) + memset(noise_tx->tx_block, 0, + 2 * sizeof(*noise_tx->tx_block) * block_size * block_size); + memset(noise_tx->temp, 0, + 2 * sizeof(*noise_tx->temp) * block_size * block_size); + return noise_tx; +} + +void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) { + noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block); +} + +void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) { + const int block_size = noise_tx->block_size; + const float kBeta = 1.1f; + const float kEps = 1e-6f; + for (int y = 0; y < block_size; ++y) { + for (int x = 0; x < block_size; ++x) { + int i = y * block_size + x; + float *c = noise_tx->tx_block + 2 * i; + const float p = c[0] * c[0] + c[1] * c[1]; + if (p > kBeta * psd[i] && p > 1e-6) { + noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps); + noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps); + } else { + noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta; + noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta; + } + } + } +} + +void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) { + const int n = noise_tx->block_size * noise_tx->block_size; + noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data); + for (int i = 0; i < n; ++i) { + data[i] /= n; + } +} + +void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx, + float *psd) { + const int block_size = noise_tx->block_size; + for (int yb = 0; yb < block_size; ++yb) { + for (int xb = 0; xb <= block_size / 2; ++xb) { + float *c = noise_tx->tx_block + 2 * (yb * block_size + xb); + psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1]; + } + } +} + +void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) { + if (!noise_tx) return; + aom_free(noise_tx->tx_block); + aom_free(noise_tx->temp); + aom_free(noise_tx); +} + +double aom_normalized_cross_correlation(const double *a, const double *b, + int n) { + double c = 0; + double a_len = 0; + double b_len = 0; + for (int i = 0; i < n; ++i) { + a_len += a[i] * a[i]; + b_len += b[i] * b[i]; + c += a[i] * b[i]; + } + return c / (sqrt(a_len) * sqrt(b_len)); +} + +int aom_noise_data_validate(const double *data, int w, int h) { + const double kVarianceThreshold = 2; + const double kMeanThreshold = 2; + + int x = 0, y = 0; + int ret_value = 1; + double var = 0, mean = 0; + double *mean_x, *mean_y, *var_x, *var_y; + + // Check that noise variance is not increasing in x or y + // and that the data is zero mean. + mean_x = (double *)aom_malloc(sizeof(*mean_x) * w); + var_x = (double *)aom_malloc(sizeof(*var_x) * w); + mean_y = (double *)aom_malloc(sizeof(*mean_x) * h); + var_y = (double *)aom_malloc(sizeof(*var_y) * h); + + memset(mean_x, 0, sizeof(*mean_x) * w); + memset(var_x, 0, sizeof(*var_x) * w); + memset(mean_y, 0, sizeof(*mean_y) * h); + memset(var_y, 0, sizeof(*var_y) * h); + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + const double d = data[y * w + x]; + var_x[x] += d * d; + var_y[y] += d * d; + mean_x[x] += d; + mean_y[y] += d; + var += d * d; + mean += d; + } + } + mean /= (w * h); + var = var / (w * h) - mean * mean; + + for (y = 0; y < h; ++y) { + mean_y[y] /= h; + var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y]; + if (fabs(var_y[y] - var) >= kVarianceThreshold) { + fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var); + ret_value = 0; + break; + } + if (fabs(mean_y[y] - mean) >= kMeanThreshold) { + fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean); + ret_value = 0; + break; + } + } + + for (x = 0; x < w; ++x) { + mean_x[x] /= w; + var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x]; + if (fabs(var_x[x] - var) >= kVarianceThreshold) { + fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var); + ret_value = 0; + break; + } + if (fabs(mean_x[x] - mean) >= kMeanThreshold) { + fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean); + ret_value = 0; + break; + } + } + + aom_free(mean_x); + aom_free(mean_y); + aom_free(var_x); + aom_free(var_y); + + return ret_value; +} diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h new file mode 100644 index 000000000..ea4d9e3de --- /dev/null +++ b/third_party/aom/aom_dsp/noise_util.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_NOISE_UTIL_H_ +#define AOM_DSP_NOISE_UTIL_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// aom_noise_tx_t is an abstraction of a transform that is used for denoising. +// It is meant to be lightweight and does hold the transformed data (as +// the user should not be manipulating the transformed data directly). +struct aom_noise_tx_t; + +// Allocates and returns a aom_noise_tx_t useful for denoising the given +// block_size. The resulting aom_noise_tx_t should be free'd with +// aom_noise_tx_free. +struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size); +void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx); + +// Transforms the internal data and holds it in the aom_noise_tx's internal +// buffer. For compatibility with existing SIMD implementations, "data" must +// be 32-byte aligned. +void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx, + const float *data); + +// Filters aom_noise_tx's internal data using the provided noise power spectral +// density. The PSD must be at least block_size * block_size and should be +// populated with a constant or via estimates taken from +// aom_noise_tx_add_energy. +void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd); + +// Performs an inverse transform using the internal transform data. +// For compatibility with existing SIMD implementations, "data" must be 32-byte +// aligned. +void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data); + +// Aggregates the power of the buffered transform data into the psd buffer. +void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx, + float *psd); + +// Returns a default value suitable for denosing a transform of the given +// block_size. The noise "factor" determines the strength of the noise to +// be removed. A value of about 2.5 can be used for moderate denoising, +// where a value of 5.0 can be used for a high level of denoising. +float aom_noise_psd_get_default_value(int block_size, float factor); + +// Computes normalized cross correlation of two vectors a and b of length n. +double aom_normalized_cross_correlation(const double *a, const double *b, + int n); + +// Validates the correlated noise in the data buffer of size (w, h). +int aom_noise_data_validate(const double *data, int w, int h); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // AOM_DSP_NOISE_UTIL_H_ diff --git a/third_party/aom/aom_dsp/prob.c b/third_party/aom/aom_dsp/prob.c deleted file mode 100644 index a42fb806b..000000000 --- a/third_party/aom/aom_dsp/prob.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" - -#include - -#include "aom_dsp/prob.h" - -static unsigned int tree_merge_probs_impl(unsigned int i, - const aom_tree_index *tree, - const aom_prob *pre_probs, - const unsigned int *counts, - aom_prob *probs) { - const int l = tree[i]; - const unsigned int left_count = - (l <= 0) ? counts[-l] - : tree_merge_probs_impl(l, tree, pre_probs, counts, probs); - const int r = tree[i + 1]; - const unsigned int right_count = - (r <= 0) ? counts[-r] - : tree_merge_probs_impl(r, tree, pre_probs, counts, probs); - const unsigned int ct[2] = { left_count, right_count }; - probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct); - return left_count + right_count; -} - -void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, - const unsigned int *counts, aom_prob *probs) { - tree_merge_probs_impl(0, tree, pre_probs, counts, probs); -} - -typedef struct tree_node tree_node; - -struct tree_node { - aom_tree_index index; - uint8_t probs[16]; - uint8_t prob; - int path; - int len; - int l; - int r; - aom_cdf_prob pdf; -}; - -/* Compute the probability of this node in Q23 */ -static uint32_t tree_node_prob(tree_node n, int i) { - uint32_t prob; - /* 1.0 in Q23 */ - prob = 16777216; - for (; i < n.len; i++) { - prob = prob * n.probs[i] >> 8; - } - return prob; -} - -static int tree_node_cmp(tree_node a, tree_node b) { - int i; - uint32_t pa; - uint32_t pb; - for (i = 0; i < AOMMIN(a.len, b.len) && a.probs[i] == b.probs[i]; i++) { - } - pa = tree_node_prob(a, i); - pb = tree_node_prob(b, i); - return pa > pb ? 1 : pa < pb ? -1 : 0; -} - -/* Given a Q15 probability for symbol subtree rooted at tree[n], this function - computes the probability of each symbol (defined as a node that has no - children). */ -static aom_cdf_prob tree_node_compute_probs(tree_node *tree, int n, - aom_cdf_prob pdf) { - if (tree[n].l == 0) { - /* This prevents probability computations in Q15 that underflow from - producing a symbol that has zero probability. */ - if (pdf == 0) pdf = 1; - tree[n].pdf = pdf; - return pdf; - } else { - /* We process the smaller probability first, */ - if (tree[n].prob < 128) { - aom_cdf_prob lp; - aom_cdf_prob rp; - lp = (((uint32_t)pdf) * tree[n].prob + 128) >> 8; - lp = tree_node_compute_probs(tree, tree[n].l, lp); - rp = tree_node_compute_probs(tree, tree[n].r, lp > pdf ? 0 : pdf - lp); - return lp + rp; - } else { - aom_cdf_prob rp; - aom_cdf_prob lp; - rp = (((uint32_t)pdf) * (256 - tree[n].prob) + 128) >> 8; - rp = tree_node_compute_probs(tree, tree[n].r, rp); - lp = tree_node_compute_probs(tree, tree[n].l, rp > pdf ? 0 : pdf - rp); - return lp + rp; - } - } -} - -static int tree_node_extract(tree_node *tree, int n, int symb, - aom_cdf_prob *pdf, aom_tree_index *index, - int *path, int *len) { - if (tree[n].l == 0) { - pdf[symb] = tree[n].pdf; - if (index != NULL) index[symb] = tree[n].index; - if (path != NULL) path[symb] = tree[n].path; - if (len != NULL) len[symb] = tree[n].len; - return symb + 1; - } else { - symb = tree_node_extract(tree, tree[n].l, symb, pdf, index, path, len); - return tree_node_extract(tree, tree[n].r, symb, pdf, index, path, len); - } -} - -int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, - aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *index, - int *path, int *len) { - tree_node symb[2 * 16 - 1]; - int nodes; - int next[16]; - int size; - int nsymbs; - int i; - /* Create the root node with probability 1 in Q15. */ - symb[0].index = root; - symb[0].path = 0; - symb[0].len = 0; - symb[0].l = symb[0].r = 0; - nodes = 1; - next[0] = 0; - size = 1; - nsymbs = 1; - while (size > 0 && nsymbs < 16) { - int m; - tree_node n; - aom_tree_index j; - uint8_t prob; - m = 0; - /* Find the internal node with the largest probability. */ - for (i = 1; i < size; i++) { - if (tree_node_cmp(symb[next[i]], symb[next[m]]) > 0) m = i; - } - i = next[m]; - memmove(&next[m], &next[m + 1], sizeof(*next) * (size - (m + 1))); - size--; - /* Split this symbol into two symbols */ - n = symb[i]; - j = n.index; - prob = probs[j >> 1]; - /* Left */ - n.index = tree[j]; - n.path <<= 1; - n.len++; - n.probs[n.len - 1] = prob; - symb[nodes] = n; - if (n.index > 0) { - next[size++] = nodes; - } - /* Right */ - n.index = tree[j + 1]; - n.path += 1; - n.probs[n.len - 1] = 256 - prob; - symb[nodes + 1] = n; - if (n.index > 0) { - next[size++] = nodes + 1; - } - symb[i].prob = prob; - symb[i].l = nodes; - symb[i].r = nodes + 1; - nodes += 2; - nsymbs++; - } - /* Compute the probabilities of each symbol in Q15 */ - tree_node_compute_probs(symb, 0, CDF_PROB_TOP); - /* Extract the cdf, index, path and length */ - tree_node_extract(symb, 0, 0, cdf, index, path, len); - /* Convert to CDF */ - cdf[0] = AOM_ICDF(cdf[0]); - for (i = 1; i < nsymbs; i++) { - cdf[i] = AOM_ICDF(AOM_ICDF(cdf[i - 1]) + cdf[i]); - } - // Store symbol count at the end of the CDF - cdf[nsymbs] = 0; - return nsymbs; -} - -/* This code assumes that tree contains as unique leaf nodes the integer values - 0 to len - 1 and produces the forward and inverse mapping tables in ind[] - and inv[] respectively. */ -static void tree_to_index(int *stack_index, int *ind, int *inv, - const aom_tree_index *tree, int value, int index) { - value *= 2; - - do { - const aom_tree_index content = tree[index]; - ++index; - if (content <= 0) { - inv[*stack_index] = -content; - ind[-content] = *stack_index; - ++(*stack_index); - } else { - tree_to_index(stack_index, ind, inv, tree, value, content); - } - } while (++value & 1); -} - -void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree) { - int stack_index = 0; - tree_to_index(&stack_index, ind, inv, tree, 0, 0); -} diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h index a517e810a..85dd4249d 100644 --- a/third_party/aom/aom_dsp/prob.h +++ b/third_party/aom/aom_dsp/prob.h @@ -13,194 +13,657 @@ #define AOM_DSP_PROB_H_ #include +#include -#include "./aom_config.h" -#include "./aom_dsp_common.h" +#include "config/aom_config.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/entcode.h" #include "aom_ports/bitops.h" #include "aom_ports/mem.h" -#if !CONFIG_ANS -#include "aom_dsp/entcode.h" -#endif - #ifdef __cplusplus extern "C" { #endif -typedef uint8_t aom_prob; - // TODO(negge): Rename this aom_prob once we remove vpxbool. typedef uint16_t aom_cdf_prob; #define CDF_SIZE(x) ((x) + 1) - #define CDF_PROB_BITS 15 #define CDF_PROB_TOP (1 << CDF_PROB_BITS) +#define CDF_INIT_TOP 32768 +#define CDF_SHIFT (15 - CDF_PROB_BITS) +/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative + probability (an "inverse" CDF). + This function converts from one representation to the other (and is its own + inverse).*/ +#define AOM_ICDF(x) (CDF_PROB_TOP - (x)) + +#if CDF_SHIFT == 0 + +#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF4(a0, a1, a2) \ + AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF5(a0, a1, a2, a3) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF6(a0, a1, a2, a3, a4) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ + a14) \ + AOM_ICDF(a0) \ + , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \ + AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \ + AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \ + AOM_ICDF(CDF_PROB_TOP), 0 -#if !CONFIG_ANS -#define AOM_ICDF OD_ICDF #else -#define AOM_ICDF(x) (x) -#endif - -#define MAX_PROB 255 - -#define LV_MAP_PROB 1 +#define AOM_CDF2(a0) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \ + ((CDF_INIT_TOP - 2) >> 1)) / \ + ((CDF_INIT_TOP - 2)) + \ + 1) \ + , AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF3(a0, a1) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ + ((CDF_INIT_TOP - 3) >> 1)) / \ + ((CDF_INIT_TOP - 3)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \ + ((CDF_INIT_TOP - 3) >> 1)) / \ + ((CDF_INIT_TOP - 3)) + \ + 2), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF4(a0, a1, a2) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \ + ((CDF_INIT_TOP - 4) >> 1)) / \ + ((CDF_INIT_TOP - 4)) + \ + 3), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF5(a0, a1, a2, a3) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \ + ((CDF_INIT_TOP - 5) >> 1)) / \ + ((CDF_INIT_TOP - 5)) + \ + 4), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF6(a0, a1, a2, a3, a4) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \ + ((CDF_INIT_TOP - 6) >> 1)) / \ + ((CDF_INIT_TOP - 6)) + \ + 5), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \ + ((CDF_INIT_TOP - 7) >> 1)) / \ + ((CDF_INIT_TOP - 7)) + \ + 6), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \ + ((CDF_INIT_TOP - 8) >> 1)) / \ + ((CDF_INIT_TOP - 8)) + \ + 7), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \ + ((CDF_INIT_TOP - 9) >> 1)) / \ + ((CDF_INIT_TOP - 9)) + \ + 8), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \ + ((CDF_INIT_TOP - 10) >> 1)) / \ + ((CDF_INIT_TOP - 10)) + \ + 9), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \ + ((CDF_INIT_TOP - 11) >> 1)) / \ + ((CDF_INIT_TOP - 11)) + \ + 10), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \ + ((CDF_INIT_TOP - 12) >> 1)) / \ + ((CDF_INIT_TOP - 12)) + \ + 11), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \ + ((CDF_INIT_TOP - 13) >> 1)) / \ + ((CDF_INIT_TOP - 13)) + \ + 12), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \ + ((CDF_INIT_TOP - 14) >> 1)) / \ + ((CDF_INIT_TOP - 14)) + \ + 13), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 13), \ + AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \ + ((CDF_INIT_TOP - 15) >> 1)) / \ + ((CDF_INIT_TOP - 15)) + \ + 14), \ + AOM_ICDF(CDF_PROB_TOP), 0 +#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \ + a14) \ + AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 1) \ + , \ + AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 2), \ + AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 3), \ + AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 4), \ + AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 5), \ + AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 6), \ + AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 7), \ + AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 8), \ + AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 9), \ + AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 10), \ + AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 11), \ + AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 12), \ + AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 13), \ + AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 14), \ + AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \ + ((CDF_INIT_TOP - 16) >> 1)) / \ + ((CDF_INIT_TOP - 16)) + \ + 15), \ + AOM_ICDF(CDF_PROB_TOP), 0 -#define BR_NODE 1 - -#if CONFIG_ADAPT_SCAN -#define CACHE_SCAN_PROB 1 #endif -#define aom_prob_half ((aom_prob)128) - -typedef int8_t aom_tree_index; - -#define TREE_SIZE(leaf_count) (-2 + 2 * (leaf_count)) - -#define MODE_MV_COUNT_SAT 20 - -/* We build coding trees compactly in arrays. - Each node of the tree is a pair of aom_tree_indices. - Array index often references a corresponding probability table. - Index <= 0 means done encoding/decoding and value = -Index, - Index > 0 means need another bit, specification at index. - Nonnegative indices are always even; processing begins at node 0. */ - -typedef const aom_tree_index aom_tree[]; - -static INLINE aom_prob get_prob(unsigned int num, unsigned int den) { +static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { assert(den != 0); { const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); // (p > 255) ? 255 : (p < 1) ? 1 : p; const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); - return (aom_prob)clipped_prob; + return (uint8_t)clipped_prob; } } -static INLINE aom_prob get_binary_prob(unsigned int n0, unsigned int n1) { - const unsigned int den = n0 + n1; - if (den == 0) return 128u; - return get_prob(n0, den); -} - -/* This function assumes prob1 and prob2 are already within [1,255] range. */ -static INLINE aom_prob weighted_prob(int prob1, int prob2, int factor) { - return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); -} - -static INLINE aom_prob merge_probs(aom_prob pre_prob, const unsigned int ct[2], - unsigned int count_sat, - unsigned int max_update_factor) { - const aom_prob prob = get_binary_prob(ct[0], ct[1]); - const unsigned int count = AOMMIN(ct[0] + ct[1], count_sat); - const unsigned int factor = max_update_factor * count / count_sat; - return weighted_prob(pre_prob, prob, factor); -} - -// MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT; -static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = { - 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, - 70, 76, 83, 89, 96, 102, 108, 115, 121, 128 -}; - -static INLINE aom_prob mode_mv_merge_probs(aom_prob pre_prob, - const unsigned int ct[2]) { - const unsigned int den = ct[0] + ct[1]; - if (den == 0) { - return pre_prob; - } else { - const unsigned int count = AOMMIN(den, MODE_MV_COUNT_SAT); - const unsigned int factor = count_to_update_factor[count]; - const aom_prob prob = get_prob(ct[0], den); - return weighted_prob(pre_prob, prob, factor); - } -} - -void aom_tree_merge_probs(const aom_tree_index *tree, const aom_prob *pre_probs, - const unsigned int *counts, aom_prob *probs); - -int tree_to_cdf(const aom_tree_index *tree, const aom_prob *probs, - aom_tree_index root, aom_cdf_prob *cdf, aom_tree_index *ind, - int *pth, int *len); - -static INLINE void av1_tree_to_cdf(const aom_tree_index *tree, - const aom_prob *probs, aom_cdf_prob *cdf) { - aom_tree_index index[16]; - int path[16]; - int dist[16]; - tree_to_cdf(tree, probs, 0, cdf, index, path, dist); -} - -#define av1_tree_to_cdf_1D(tree, probs, cdf, u) \ - do { \ - int i; \ - for (i = 0; i < u; i++) { \ - av1_tree_to_cdf(tree, probs[i], cdf[i]); \ - } \ - } while (0) - -#define av1_tree_to_cdf_2D(tree, probs, cdf, v, u) \ - do { \ - int j; \ - int i; \ - for (j = 0; j < v; j++) { \ - for (i = 0; i < u; i++) { \ - av1_tree_to_cdf(tree, probs[j][i], cdf[j][i]); \ - } \ - } \ - } while (0) - -void av1_indices_from_tree(int *ind, int *inv, const aom_tree_index *tree); - static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { - int rate = 4 + (cdf[nsymbs] > 31) + get_msb(nsymbs); -#if CONFIG_LV_MAP - if (nsymbs == 2) - rate = 4 + (cdf[nsymbs] > 7) + (cdf[nsymbs] > 15) + get_msb(nsymbs); -#endif - const int rate2 = 5; + int rate; int i, tmp; - int diff; -#if 1 - const int tmp0 = 1 << rate2; - tmp = AOM_ICDF(tmp0); - diff = ((CDF_PROB_TOP - (nsymbs << rate2)) >> rate) << rate; -// Single loop (faster) -#if !CONFIG_ANS - for (i = 0; i < nsymbs - 1; ++i, tmp -= tmp0) { - tmp -= (i == val ? diff : 0); - cdf[i] += ((tmp - cdf[i]) >> rate); - } -#else - for (i = 0; i < nsymbs - 1; ++i, tmp += tmp0) { - tmp += (i == val ? diff : 0); - cdf[i] -= ((cdf[i] - tmp) >> rate); - } -#endif -#else - for (i = 0; i < nsymbs; ++i) { - tmp = (i + 1) << rate2; - cdf[i] -= ((cdf[i] - tmp) >> rate); - } - diff = CDF_PROB_TOP - cdf[nsymbs - 1]; - for (i = val; i < nsymbs; ++i) { - cdf[i] += diff; + static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2 }; + assert(nsymbs < 17); + rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) + + nsymbs2speed[nsymbs]; // + get_msb(nsymbs); + tmp = AOM_ICDF(0); + + // Single loop (faster) + for (i = 0; i < nsymbs - 1; ++i) { + tmp = (i == val) ? 0 : tmp; + if (tmp < cdf[i]) { + cdf[i] -= ((cdf[i] - tmp) >> rate); + } else { + cdf[i] += ((tmp - cdf[i]) >> rate); + } } -#endif cdf[nsymbs] += (cdf[nsymbs] < 32); } -#if CONFIG_LV_MAP -static INLINE void update_bin(aom_cdf_prob *cdf, int val, int nsymbs) { - update_cdf(cdf, val, nsymbs); -} -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c index d543f12d1..37d3bb585 100644 --- a/third_party/aom/aom_dsp/psnr.c +++ b/third_party/aom/aom_dsp/psnr.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/psnr.h" #include "aom_scale/yv12config.h" @@ -26,8 +27,8 @@ double aom_sse_to_psnr(double samples, double peak, double sse) { } /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() -* and highbd_8_variance(). It should not. -*/ + * and highbd_8_variance(). It should not. + */ static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, unsigned int *sse, int *sum) { @@ -48,26 +49,26 @@ static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, } } -#if CONFIG_HIGHBITDEPTH static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int w, int h, uint64_t *sse, int64_t *sum) { - int i, j; - - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t tsum = 0; + uint64_t tsse = 0; + for (int i = 0; i < h; ++i) { + int32_t lsum = 0; + for (int j = 0; j < w; ++j) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + lsum += diff; + tsse += (uint32_t)(diff * diff); } + tsum += lsum; a += a_stride; b += b_stride; } + *sum = tsum; + *sse = tsse; } static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, @@ -80,7 +81,6 @@ static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, *sse = (unsigned int)sse_long; *sum = (int)sum_long; } -#endif // CONFIG_HIGHBITDEPTH static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { @@ -122,7 +122,6 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, return total_sse; } -#if CONFIG_HIGHBITDEPTH static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height, unsigned int input_shift) { @@ -175,7 +174,6 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, } return total_sse; } -#endif // CONFIG_HIGHBITDEPTH int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, @@ -228,7 +226,6 @@ int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, a->uv_crop_width, a->uv_crop_height); } -#if CONFIG_HIGHBITDEPTH int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height) { @@ -287,11 +284,9 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride, a->uv_crop_width, a->uv_crop_height); } -#endif // CONFIG_HIGHBITDEPTH int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int plane, int highbd) { -#if CONFIG_HIGHBITDEPTH if (highbd) { switch (plane) { case 0: return aom_highbd_get_y_sse(a, b); @@ -300,7 +295,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, default: assert(plane >= 0 && plane <= 2); return 0; } } -#endif (void)highbd; switch (plane) { case 0: return aom_get_y_sse(a, b); @@ -310,7 +304,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, } } -#if CONFIG_HIGHBITDEPTH void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, uint32_t bit_depth, uint32_t in_bit_depth) { @@ -356,8 +349,6 @@ void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, aom_sse_to_psnr((double)total_samples, peak, (double)total_sse); } -#endif // !CONFIG_HIGHBITDEPTH - void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr) { static const double peak = 255.0; diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h index df5f8f9f2..8300b0a88 100644 --- a/third_party/aom/aom_dsp/psnr.h +++ b/third_party/aom/aom_dsp/psnr.h @@ -27,13 +27,13 @@ typedef struct { } PSNR_STATS; /*!\brief Converts SSE to PSNR -* -* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). -* -* \param[in] samples Number of samples -* \param[in] peak Max sample value -* \param[in] sse Sum of squared errors -*/ + * + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ double aom_sse_to_psnr(double samples, double peak, double sse); int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, @@ -49,7 +49,6 @@ int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a, int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int plane, int highbd); -#if CONFIG_HIGHBITDEPTH int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, int hstart, int width, int vstart, int height); @@ -68,7 +67,6 @@ int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a, void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr, unsigned int bit_depth, unsigned int in_bit_depth); -#endif void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr); diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c index aeefd5908..30fe21d9c 100644 --- a/third_party/aom/aom_dsp/psnrhvs.c +++ b/third_party/aom/aom_dsp/psnrhvs.c @@ -17,17 +17,13 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/psnr.h" #include "aom_dsp/ssim.h" #include "aom_ports/system_state.h" -#if !defined(M_PI) -#define M_PI (3.141592653589793238462643) -#endif -#include - static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; @@ -38,7 +34,6 @@ static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } -#if CONFIG_HIGHBITDEPTH static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) { int i, j; @@ -48,7 +43,6 @@ static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, for (j = 0; j < 8; j++) *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3; } -#endif /* Normalized inverse quantization matrix for 8x8 DCT at the point of * transparency. This is not the JPEG based matrix from the paper, @@ -123,14 +117,16 @@ static double convert_score_db(double _score, double _weight, int bit_depth) { static double calc_psnrhvs(const unsigned char *src, int _systride, const unsigned char *dst, int _dystride, double _par, int _w, int _h, int _step, const double _csf[8][8], - uint32_t bit_depth, uint32_t _shift) { + uint32_t _shift, int buf_is_hbd) { double ret; const uint8_t *_src8 = src; const uint8_t *_dst8 = dst; const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); - int16_t dct_s[8 * 8], dct_d[8 * 8]; - tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8]; + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); double mask[8][8]; int pixels; int x; @@ -176,10 +172,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, for (i = 0; i < 8; i++) { for (j = 0; j < 8; j++) { int sub = ((i & 12) >> 2) + ((j & 12) >> 1); - if (bit_depth == 8 && _shift == 0) { + if (!buf_is_hbd) { dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)]; dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)]; - } else if (bit_depth == 10 || bit_depth == 12) { + } else { dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift; dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift; } @@ -212,15 +208,12 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar; if (d_gvar > 0) d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar; -#if CONFIG_HIGHBITDEPTH - if (bit_depth == 10 || bit_depth == 12) { - hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); - hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); - } -#endif - if (bit_depth == 8) { + if (!buf_is_hbd) { od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); + } else { + hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8); + hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8); } for (i = 0; i < 8; i++) for (j = (i == 0); j < 8; j++) @@ -256,21 +249,24 @@ double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst, const int step = 7; uint32_t bd_shift = 0; aom_clear_system_state(); - assert(bd == 8 || bd == 10 || bd == 12); assert(bd >= in_bd); + assert(src->flags == dst->flags); + const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH; bd_shift = bd - in_bd; - *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, par, src->y_crop_width, - src->y_crop_height, step, csf_y, bd, bd_shift); - *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, par, src->uv_crop_width, - src->uv_crop_height, step, csf_cb420, bd, bd_shift); - *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, par, src->uv_crop_width, - src->uv_crop_height, step, csf_cr420, bd, bd_shift); + *y_psnrhvs = calc_psnrhvs( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par, + src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd); + *u_psnrhvs = + calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cb420, bd_shift, buf_is_hbd); + *v_psnrhvs = + calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + par, src->uv_crop_width, src->uv_crop_height, step, + csf_cr420, bd_shift, buf_is_hbd); psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs)); return convert_score_db(psnrhvs, 1.0, in_bd); } diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c index 21bcc486a..e1601cc7d 100644 --- a/third_party/aom/aom_dsp/quantize.c +++ b/third_party/aom/aom_dsp/quantize.c @@ -66,7 +66,8 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / (1 << log_scale); + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (tmp32) eob = i; } @@ -87,11 +88,7 @@ void highbd_quantize_b_helper_c( ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; int dequant; -#if CONFIG_TX64X64 int idx_arr[4096]; -#else - int idx_arr[1024]; -#endif (void)iscan; int idx = 0; @@ -130,45 +127,14 @@ void highbd_quantize_b_helper_c( qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); if (abs_qcoeff) eob = idx_arr[i]; } } *eob_ptr = eob + 1; } -void quantize_dc_helper(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr, const int log_scale) { - const int rc = 0; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int64_t tmp; - int eob = -1; - int32_t tmp32; - int dequant; - - memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); - tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), - INT16_MIN, INT16_MAX); - tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; - dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / (1 << log_scale); - if (tmp32) eob = 0; - } - *eob_ptr = eob + 1; -} - /* These functions should only be called when quantisation matrices are not used. */ void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -195,7 +161,6 @@ void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } -#if CONFIG_TX64X64 void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -207,34 +172,6 @@ void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } -#endif // CONFIG_TX64X64 - -void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { - quantize_dc_helper(coeff_ptr, n_coeffs, skip_block, round_ptr, quant, - qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, - 0); -} - -void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { - quantize_dc_helper(coeff_ptr, 1024, skip_block, round_ptr, quant, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 1); -} - -#if CONFIG_TX64X64 -void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { - quantize_dc_helper(coeff_ptr, 4096, skip_block, round_ptr, quant, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, NULL, NULL, 2); -} -#endif // CONFIG_TX64X64 void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -261,7 +198,6 @@ void aom_highbd_quantize_b_32x32_c( NULL, NULL, 1); } -#if CONFIG_TX64X64 void aom_highbd_quantize_b_64x64_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -273,4 +209,3 @@ void aom_highbd_quantize_b_64x64_c( dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } -#endif // CONFIG_TX64X64 diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h index 03609e8b4..56d50b929 100644 --- a/third_party/aom/aom_dsp/quantize.h +++ b/third_party/aom/aom_dsp/quantize.h @@ -12,7 +12,8 @@ #ifndef AOM_DSP_QUANTIZE_H_ #define AOM_DSP_QUANTIZE_H_ -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_dsp/aom_dsp_common.h" #ifdef __cplusplus @@ -44,7 +45,6 @@ void highbd_quantize_b_helper_c( const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale); -#if CONFIG_HIGHBITDEPTH void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -52,69 +52,6 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#endif - -void aom_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -void aom_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#if CONFIG_TX64X64 -void aom_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#endif // CONFIG_TX64X64 - -#if CONFIG_AOM_QM -#if CONFIG_HIGHBITDEPTH -void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr); -void aom_highbd_quantize_dc_32x32( - const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr); -#if CONFIG_TX64X64 -void aom_highbd_quantize_dc_64x64( - const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr, const qm_val_t *qm_ptr, - const qm_val_t *iqm_ptr); -#endif // CONFIG_TX64X64 -#endif // CONFIG_HIGHBITDEPTH - -#else // CONFIG_AOM_QM - -#if CONFIG_HIGHBITDEPTH -void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, - uint16_t *eob_ptr); -void aom_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, - const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#if CONFIG_TX64X64 -void aom_highbd_quantize_dc_64x64(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, - const int16_t quant_ptr, - tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); -#endif // CONFIG_TX64X64 -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AOM_QM #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c index 6b8ca669b..ede4c583b 100644 --- a/third_party/aom/aom_dsp/sad.c +++ b/third_party/aom/aom_dsp/sad.c @@ -11,8 +11,8 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" @@ -33,32 +33,35 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, return sad; } -#define sadMxN(m, n) \ - unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ - } \ - unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - uint8_t comp_pred[m * n]; \ - aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ +#define sadMxh(m) \ + unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, int width, \ + int height) { \ + return sad(a, a_stride, b, b_stride, width, height); \ } -// depending on call sites, pass **ref_array to avoid & in subsequent call and -// de-dup with 4D below. -#define sadMxNxK(m, n, k) \ - void aom_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = \ - aom_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ +#define sadMxN(m, n) \ + unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad(src, src_stride, ref, ref_stride, m, n); \ + } \ + unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + uint8_t comp_pred[m * n]; \ + aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return sad(src, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int aom_jnt_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return sad(src, src_stride, comp_pred, m, m, n); \ } -// This appears to be equivalent to the above when k == 4 and refs is const +// Calculate sad against 4 reference locations and store each in sad_array #define sadMxNx4D(m, n) \ void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[], \ @@ -70,11 +73,8 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, } /* clang-format off */ -#if CONFIG_AV1 && CONFIG_EXT_PARTITION // 128x128 sadMxN(128, 128) -sadMxNxK(128, 128, 3) -sadMxNxK(128, 128, 8) sadMxNx4D(128, 128) // 128x64 @@ -84,12 +84,9 @@ sadMxNx4D(128, 64) // 64x128 sadMxN(64, 128) sadMxNx4D(64, 128) -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION // 64x64 sadMxN(64, 64) -sadMxNxK(64, 64, 3) -sadMxNxK(64, 64, 8) sadMxNx4D(64, 64) // 64x32 @@ -102,8 +99,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 3) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -116,45 +111,39 @@ sadMxNx4D(16, 32) // 16x16 sadMxN(16, 16) -sadMxNxK(16, 16, 3) -sadMxNxK(16, 16, 8) sadMxNx4D(16, 16) // 16x8 sadMxN(16, 8) -sadMxNxK(16, 8, 3) -sadMxNxK(16, 8, 8) sadMxNx4D(16, 8) // 8x16 sadMxN(8, 16) -sadMxNxK(8, 16, 3) -sadMxNxK(8, 16, 8) sadMxNx4D(8, 16) // 8x8 sadMxN(8, 8) -sadMxNxK(8, 8, 3) -sadMxNxK(8, 8, 8) sadMxNx4D(8, 8) // 8x4 sadMxN(8, 4) -sadMxNxK(8, 4, 8) sadMxNx4D(8, 4) // 4x8 sadMxN(4, 8) -sadMxNxK(4, 8, 8) sadMxNx4D(4, 8) // 4x4 sadMxN(4, 4) -sadMxNxK(4, 4, 3) -sadMxNxK(4, 4, 8) sadMxNx4D(4, 4) -#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES +sadMxh(128); +sadMxh(64); +sadMxh(32); +sadMxh(16); +sadMxh(8); +sadMxh(4); + sadMxN(4, 16) sadMxNx4D(4, 16) sadMxN(16, 4) @@ -167,15 +156,10 @@ sadMxN(16, 64) sadMxNx4D(16, 64) sadMxN(64, 16) sadMxNx4D(64, 16) -sadMxN(32, 128) -sadMxNx4D(32, 128) -sadMxN(128, 32) -sadMxNx4D(128, 32) -#endif -/* clang-format on */ - -#if CONFIG_HIGHBITDEPTH - static INLINE + + /* clang-format on */ + + static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int y, x; @@ -216,19 +200,16 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ uint16_t comp_pred[m * n]; \ - aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ + aom_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t comp_pred[m * n]; \ + aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, \ + ref_stride, jcp_param); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ - } - -#define highbd_sadMxNxK(m, n, k) \ - void aom_highbd_sad##m##x##n##x##k##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref_array, \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \ - &ref_array[i], ref_stride); \ - } \ } #define highbd_sadMxNx4D(m, n) \ @@ -243,11 +224,8 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, } /* clang-format off */ -#if CONFIG_AV1 && CONFIG_EXT_PARTITION // 128x128 highbd_sadMxN(128, 128) -highbd_sadMxNxK(128, 128, 3) -highbd_sadMxNxK(128, 128, 8) highbd_sadMxNx4D(128, 128) // 128x64 @@ -257,12 +235,9 @@ highbd_sadMxNx4D(128, 64) // 64x128 highbd_sadMxN(64, 128) highbd_sadMxNx4D(64, 128) -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION // 64x64 highbd_sadMxN(64, 64) -highbd_sadMxNxK(64, 64, 3) -highbd_sadMxNxK(64, 64, 8) highbd_sadMxNx4D(64, 64) // 64x32 @@ -275,8 +250,6 @@ highbd_sadMxNx4D(32, 64) // 32x32 highbd_sadMxN(32, 32) -highbd_sadMxNxK(32, 32, 3) -highbd_sadMxNxK(32, 32, 8) highbd_sadMxNx4D(32, 32) // 32x16 @@ -289,45 +262,32 @@ highbd_sadMxNx4D(16, 32) // 16x16 highbd_sadMxN(16, 16) -highbd_sadMxNxK(16, 16, 3) -highbd_sadMxNxK(16, 16, 8) highbd_sadMxNx4D(16, 16) // 16x8 highbd_sadMxN(16, 8) -highbd_sadMxNxK(16, 8, 3) -highbd_sadMxNxK(16, 8, 8) highbd_sadMxNx4D(16, 8) // 8x16 highbd_sadMxN(8, 16) -highbd_sadMxNxK(8, 16, 3) -highbd_sadMxNxK(8, 16, 8) highbd_sadMxNx4D(8, 16) // 8x8 highbd_sadMxN(8, 8) -highbd_sadMxNxK(8, 8, 3) -highbd_sadMxNxK(8, 8, 8) highbd_sadMxNx4D(8, 8) // 8x4 highbd_sadMxN(8, 4) -highbd_sadMxNxK(8, 4, 8) highbd_sadMxNx4D(8, 4) // 4x8 highbd_sadMxN(4, 8) -highbd_sadMxNxK(4, 8, 8) highbd_sadMxNx4D(4, 8) // 4x4 highbd_sadMxN(4, 4) -highbd_sadMxNxK(4, 4, 3) -highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) -#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES highbd_sadMxN(4, 16) highbd_sadMxNx4D(4, 16) highbd_sadMxN(16, 4) @@ -340,277 +300,4 @@ highbd_sadMxN(16, 64) highbd_sadMxNx4D(16, 64) highbd_sadMxN(64, 16) highbd_sadMxNx4D(64, 16) -highbd_sadMxN(32, 128) -highbd_sadMxNx4D(32, 128) -highbd_sadMxN(128, 32) -highbd_sadMxNx4D(128, 32) -#endif -/* clang-format on */ -#endif // CONFIG_HIGHBITDEPTH - -#if CONFIG_AV1 - static INLINE - unsigned int masked_sad(const uint8_t *src, int src_stride, - const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, const uint8_t *m, int m_stride, - int width, int height) { - int y, x; - unsigned int sad = 0; - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - const uint8_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); - sad += abs(pred - src[x]); - } - - src += src_stride; - a += a_stride; - b += b_stride; - m += m_stride; - } - sad = (sad + 31) >> 6; - - return sad; -} - -#define MASKSADMxN(m, n) \ - unsigned int aom_masked_sad##m##x##n##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ - int invert_mask) { \ - if (!invert_mask) \ - return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \ - msk_stride, m, n); \ - else \ - return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \ - msk_stride, m, n); \ - } - -/* clang-format off */ -#if CONFIG_EXT_PARTITION -MASKSADMxN(128, 128) -MASKSADMxN(128, 64) -MASKSADMxN(64, 128) -#endif // CONFIG_EXT_PARTITION -MASKSADMxN(64, 64) -MASKSADMxN(64, 32) -MASKSADMxN(32, 64) -MASKSADMxN(32, 32) -MASKSADMxN(32, 16) -MASKSADMxN(16, 32) -MASKSADMxN(16, 16) -MASKSADMxN(16, 8) -MASKSADMxN(8, 16) -MASKSADMxN(8, 8) -MASKSADMxN(8, 4) -MASKSADMxN(4, 8) -MASKSADMxN(4, 4) - -#if CONFIG_EXT_PARTITION_TYPES -MASKSADMxN(4, 16) -MASKSADMxN(16, 4) -MASKSADMxN(8, 32) -MASKSADMxN(32, 8) -MASKSADMxN(16, 64) -MASKSADMxN(64, 16) -MASKSADMxN(32, 128) -MASKSADMxN(128, 32) -#endif -/* clang-format on */ - -#if CONFIG_HIGHBITDEPTH - static INLINE - unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, - const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, - const uint8_t *m, int m_stride, int width, - int height) { - int y, x; - unsigned int sad = 0; - const uint16_t *src = CONVERT_TO_SHORTPTR(src8); - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); - sad += abs(pred - src[x]); - } - - src += src_stride; - a += a_stride; - b += b_stride; - m += m_stride; - } - sad = (sad + 31) >> 6; - - return sad; -} - -#define HIGHBD_MASKSADMXN(m, n) \ - unsigned int aom_highbd_masked_sad##m##x##n##_c( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ - int msk_stride, int invert_mask) { \ - if (!invert_mask) \ - return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ - second_pred8, m, msk, msk_stride, m, n); \ - else \ - return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ - ref_stride, msk, msk_stride, m, n); \ - } - -#if CONFIG_EXT_PARTITION -HIGHBD_MASKSADMXN(128, 128) -HIGHBD_MASKSADMXN(128, 64) -HIGHBD_MASKSADMXN(64, 128) -#endif // CONFIG_EXT_PARTITION -HIGHBD_MASKSADMXN(64, 64) -HIGHBD_MASKSADMXN(64, 32) -HIGHBD_MASKSADMXN(32, 64) -HIGHBD_MASKSADMXN(32, 32) -HIGHBD_MASKSADMXN(32, 16) -HIGHBD_MASKSADMXN(16, 32) -HIGHBD_MASKSADMXN(16, 16) -HIGHBD_MASKSADMXN(16, 8) -HIGHBD_MASKSADMXN(8, 16) -HIGHBD_MASKSADMXN(8, 8) -HIGHBD_MASKSADMXN(8, 4) -HIGHBD_MASKSADMXN(4, 8) -HIGHBD_MASKSADMXN(4, 4) - -#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES -HIGHBD_MASKSADMXN(4, 16) -HIGHBD_MASKSADMXN(16, 4) -HIGHBD_MASKSADMXN(8, 32) -HIGHBD_MASKSADMXN(32, 8) -HIGHBD_MASKSADMXN(16, 64) -HIGHBD_MASKSADMXN(64, 16) -HIGHBD_MASKSADMXN(32, 128) -HIGHBD_MASKSADMXN(128, 32) -#endif -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 - -#if CONFIG_AV1 && CONFIG_MOTION_VAR -// pre: predictor being evaluated -// wsrc: target weighted prediction (has been *4096 to keep precision) -// mask: 2d weights (scaled by 4096) -static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, - const int32_t *wsrc, const int32_t *mask, - int width, int height) { - int y, x; - unsigned int sad = 0; - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); - - pre += pre_stride; - wsrc += width; - mask += width; - } - - return sad; -} - -#define OBMCSADMxN(m, n) \ - unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ - const int32_t *wsrc, \ - const int32_t *mask) { \ - return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ - } - -/* clang-format off */ -#if CONFIG_EXT_PARTITION -OBMCSADMxN(128, 128) -OBMCSADMxN(128, 64) -OBMCSADMxN(64, 128) -#endif // CONFIG_EXT_PARTITION -OBMCSADMxN(64, 64) -OBMCSADMxN(64, 32) -OBMCSADMxN(32, 64) -OBMCSADMxN(32, 32) -OBMCSADMxN(32, 16) -OBMCSADMxN(16, 32) -OBMCSADMxN(16, 16) -OBMCSADMxN(16, 8) -OBMCSADMxN(8, 16) -OBMCSADMxN(8, 8) -OBMCSADMxN(8, 4) -OBMCSADMxN(4, 8) -OBMCSADMxN(4, 4) - -#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES -OBMCSADMxN(4, 16) -OBMCSADMxN(16, 4) -OBMCSADMxN(8, 32) -OBMCSADMxN(32, 8) -OBMCSADMxN(16, 64) -OBMCSADMxN(64, 16) -OBMCSADMxN(32, 128) -OBMCSADMxN(128, 32) -#endif -/* clang-format on */ - -#if CONFIG_HIGHBITDEPTH - static INLINE - unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, - const int32_t *wsrc, const int32_t *mask, - int width, int height) { - int y, x; - unsigned int sad = 0; - const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); - - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) - sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); - - pre += pre_stride; - wsrc += width; - mask += width; - } - - return sad; -} - -#define HIGHBD_OBMCSADMXN(m, n) \ - unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ - const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ - const int32_t *mask) { \ - return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ - } - -/* clang-format off */ -#if CONFIG_EXT_PARTITION -HIGHBD_OBMCSADMXN(128, 128) -HIGHBD_OBMCSADMXN(128, 64) -HIGHBD_OBMCSADMXN(64, 128) -#endif // CONFIG_EXT_PARTITION -HIGHBD_OBMCSADMXN(64, 64) -HIGHBD_OBMCSADMXN(64, 32) -HIGHBD_OBMCSADMXN(32, 64) -HIGHBD_OBMCSADMXN(32, 32) -HIGHBD_OBMCSADMXN(32, 16) -HIGHBD_OBMCSADMXN(16, 32) -HIGHBD_OBMCSADMXN(16, 16) -HIGHBD_OBMCSADMXN(16, 8) -HIGHBD_OBMCSADMXN(8, 16) -HIGHBD_OBMCSADMXN(8, 8) -HIGHBD_OBMCSADMXN(8, 4) -HIGHBD_OBMCSADMXN(4, 8) -HIGHBD_OBMCSADMXN(4, 4) - -#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES -HIGHBD_OBMCSADMXN(4, 16) -HIGHBD_OBMCSADMXN(16, 4) -HIGHBD_OBMCSADMXN(8, 32) -HIGHBD_OBMCSADMXN(32, 8) -HIGHBD_OBMCSADMXN(16, 64) -HIGHBD_OBMCSADMXN(64, 16) -HIGHBD_OBMCSADMXN(32, 128) -HIGHBD_OBMCSADMXN(128, 32) -#endif -/* clang-format on */ -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 && CONFIG_MOTION_VAR + /* clang-format on */ diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c new file mode 100644 index 000000000..c176001d6 --- /dev/null +++ b/third_party/aom/aom_dsp/sad_av1.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/blend.h" + +static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride, + const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { + int y, x; + unsigned int sad = 0; + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); + sad += abs(pred - src[x]); + } + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + sad = (sad + 31) >> 6; + return sad; +} + +#define MASKSADMxN(m, n) \ + unsigned int aom_masked_sad##m##x##n##_c( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + if (!invert_mask) \ + return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n); \ + else \ + return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \ + msk_stride, m, n); \ + } + +/* clang-format off */ +MASKSADMxN(128, 128) +MASKSADMxN(128, 64) +MASKSADMxN(64, 128) +MASKSADMxN(64, 64) +MASKSADMxN(64, 32) +MASKSADMxN(32, 64) +MASKSADMxN(32, 32) +MASKSADMxN(32, 16) +MASKSADMxN(16, 32) +MASKSADMxN(16, 16) +MASKSADMxN(16, 8) +MASKSADMxN(8, 16) +MASKSADMxN(8, 8) +MASKSADMxN(8, 4) +MASKSADMxN(4, 8) +MASKSADMxN(4, 4) +MASKSADMxN(4, 16) +MASKSADMxN(16, 4) +MASKSADMxN(8, 32) +MASKSADMxN(32, 8) +MASKSADMxN(16, 64) +MASKSADMxN(64, 16) + + /* clang-format on */ + + static INLINE + unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m, int m_stride, int width, + int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]); + sad += abs(pred - src[x]); + } + + src += src_stride; + a += a_stride; + b += b_stride; + m += m_stride; + } + sad = (sad + 31) >> 6; + + return sad; +} + +#define HIGHBD_MASKSADMXN(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_c( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + if (!invert_mask) \ + return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \ + second_pred8, m, msk, msk_stride, m, n); \ + else \ + return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \ + ref_stride, msk, msk_stride, m, n); \ + } + +HIGHBD_MASKSADMXN(128, 128) +HIGHBD_MASKSADMXN(128, 64) +HIGHBD_MASKSADMXN(64, 128) +HIGHBD_MASKSADMXN(64, 64) +HIGHBD_MASKSADMXN(64, 32) +HIGHBD_MASKSADMXN(32, 64) +HIGHBD_MASKSADMXN(32, 32) +HIGHBD_MASKSADMXN(32, 16) +HIGHBD_MASKSADMXN(16, 32) +HIGHBD_MASKSADMXN(16, 16) +HIGHBD_MASKSADMXN(16, 8) +HIGHBD_MASKSADMXN(8, 16) +HIGHBD_MASKSADMXN(8, 8) +HIGHBD_MASKSADMXN(8, 4) +HIGHBD_MASKSADMXN(4, 8) +HIGHBD_MASKSADMXN(4, 4) +HIGHBD_MASKSADMXN(4, 16) +HIGHBD_MASKSADMXN(16, 4) +HIGHBD_MASKSADMXN(8, 32) +HIGHBD_MASKSADMXN(32, 8) +HIGHBD_MASKSADMXN(16, 64) +HIGHBD_MASKSADMXN(64, 16) + +// pre: predictor being evaluated +// wsrc: target weighted prediction (has been *4096 to keep precision) +// mask: 2d weights (scaled by 4096) +static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define OBMCSADMxN(m, n) \ + unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \ + const int32_t *wsrc, \ + const int32_t *mask) { \ + return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +OBMCSADMxN(128, 128) +OBMCSADMxN(128, 64) +OBMCSADMxN(64, 128) +OBMCSADMxN(64, 64) +OBMCSADMxN(64, 32) +OBMCSADMxN(32, 64) +OBMCSADMxN(32, 32) +OBMCSADMxN(32, 16) +OBMCSADMxN(16, 32) +OBMCSADMxN(16, 16) +OBMCSADMxN(16, 8) +OBMCSADMxN(8, 16) +OBMCSADMxN(8, 8) +OBMCSADMxN(8, 4) +OBMCSADMxN(4, 8) +OBMCSADMxN(4, 4) +OBMCSADMxN(4, 16) +OBMCSADMxN(16, 4) +OBMCSADMxN(8, 32) +OBMCSADMxN(32, 8) +OBMCSADMxN(16, 64) +OBMCSADMxN(64, 16) + /* clang-format on */ + + static INLINE + unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride, + const int32_t *wsrc, const int32_t *mask, + int width, int height) { + int y, x; + unsigned int sad = 0; + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12); + + pre += pre_stride; + wsrc += width; + mask += width; + } + + return sad; +} + +#define HIGHBD_OBMCSADMXN(m, n) \ + unsigned int aom_highbd_obmc_sad##m##x##n##_c( \ + const uint8_t *ref, int ref_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \ + } + +/* clang-format off */ +HIGHBD_OBMCSADMXN(128, 128) +HIGHBD_OBMCSADMXN(128, 64) +HIGHBD_OBMCSADMXN(64, 128) +HIGHBD_OBMCSADMXN(64, 64) +HIGHBD_OBMCSADMXN(64, 32) +HIGHBD_OBMCSADMXN(32, 64) +HIGHBD_OBMCSADMXN(32, 32) +HIGHBD_OBMCSADMXN(32, 16) +HIGHBD_OBMCSADMXN(16, 32) +HIGHBD_OBMCSADMXN(16, 16) +HIGHBD_OBMCSADMXN(16, 8) +HIGHBD_OBMCSADMXN(8, 16) +HIGHBD_OBMCSADMXN(8, 8) +HIGHBD_OBMCSADMXN(8, 4) +HIGHBD_OBMCSADMXN(4, 8) +HIGHBD_OBMCSADMXN(4, 4) +HIGHBD_OBMCSADMXN(4, 16) +HIGHBD_OBMCSADMXN(16, 4) +HIGHBD_OBMCSADMXN(8, 32) +HIGHBD_OBMCSADMXN(32, 8) +HIGHBD_OBMCSADMXN(16, 64) +HIGHBD_OBMCSADMXN(64, 16) +/* clang-format on */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h index 8f6509383..51a38a7e1 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h @@ -15,8 +15,9 @@ #include #include #include -#include "./v128_intrinsics_c.h" -#include "./v64_intrinsics.h" + +#include "aom_dsp/simd/v128_intrinsics_c.h" +#include "aom_dsp/simd/v64_intrinsics.h" /* Fallback to plain, unoptimised C. */ @@ -57,6 +58,7 @@ SIMD_INLINE v128 v128_zero() { return c_v128_zero(); } SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); } SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); } SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); } +SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); } typedef uint32_t sad128_internal; SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); } @@ -74,9 +76,15 @@ SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { return c_v128_ssd_u8_sum(s); } +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + return c_v128_dotp_su8(a, b); +} SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { return c_v128_dotp_s16(a, b); } +SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { + return c_v128_dotp_s32(a, b); +} SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); } SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); } @@ -86,8 +94,12 @@ SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); } SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); } SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); } +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); } +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); } SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); } SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); } +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); } +SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); } SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); } SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); } SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); } @@ -96,6 +108,7 @@ SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); } SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); } SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); } SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); } +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); } SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); } SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); } @@ -112,8 +125,16 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); } SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); } +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); } +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { + return c_v128_blend_8(a, b, c); +} + SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); } SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); } +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return c_v128_rdavg_u16(a, b); +} SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); } SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); } SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); } @@ -121,6 +142,8 @@ SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); } SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); } SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); } SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); } +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); } +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); } SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); } SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); } @@ -168,6 +191,9 @@ SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { return c_v128_pack_s32_s16(a, b); } +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { + return c_v128_pack_s32_u16(a, b); +} SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { return c_v128_pack_s16_u8(a, b); } @@ -203,6 +229,14 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { } SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); } +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return c_v128_cmpgt_s32(a, b); +} +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return c_v128_cmplt_s32(a, b); +} +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); } + SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { return c_v128_shl_8(a, c); } @@ -230,6 +264,15 @@ SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { return c_v128_shr_s32(a, c); } +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return c_v128_shl_64(a, c); +} +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return c_v128_shr_u64(a, c); +} +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + return c_v128_shr_s64(a, c); +} SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { return c_v128_shr_n_byte(a, n); @@ -246,6 +289,9 @@ SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) { SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) { return c_v128_shl_n_32(a, n); } +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) { + return c_v128_shl_n_64(a, n); +} SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) { return c_v128_shr_n_u8(a, n); } @@ -255,6 +301,9 @@ SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) { SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) { return c_v128_shr_n_u32(a, n); } +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) { + return c_v128_shr_n_u64(a, n); +} SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) { return c_v128_shr_n_s8(a, n); } @@ -264,5 +313,32 @@ SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) { SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) { return c_v128_shr_n_s32(a, n); } +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) { + return c_v128_shr_n_s64(a, n); +} + +typedef uint32_t sad128_internal_u16; +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { + return c_v128_sad_u16_init(); +} +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { + return c_v128_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return c_v128_sad_u16_sum(s); +} + +typedef uint64_t ssd128_internal_s16; +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { + return c_v128_ssd_s16_init(); +} +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, + v128 b) { + return c_v128_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { + return c_v128_ssd_s16_sum(s); +} #endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h index 0377d4ce1..d4fec4237 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h @@ -13,7 +13,8 @@ #define _V128_INTRINSICS_H #include -#include "./v64_intrinsics_arm.h" + +#include "aom_dsp/simd/v64_intrinsics_arm.h" typedef int64x2_t v128; @@ -28,7 +29,7 @@ SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); } SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); } SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { - return vcombine_s64((uint64x1_t)b, (uint64x1_t)a); + return vcombine_s64((int64x1_t)b, (int64x1_t)a); } SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { @@ -83,22 +84,57 @@ SIMD_INLINE v128 v128_dup_32(uint32_t x) { return vreinterpretq_s64_u32(vdupq_n_u32(x)); } +SIMD_INLINE v128 v128_dup_64(uint64_t x) { + return vreinterpretq_s64_u64(vdupq_n_u64(x)); +} + +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + int16x8_t t1 = vmulq_s16( + vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b))))); + int16x8_t t2 = vmulq_s16( + vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))), + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b))))); +#if defined(__aarch64__) + return vaddlvq_s16(t1) + vaddlvq_s16(t2); +#else + int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2))); + return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); +#endif +} + SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); } +SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { + int64x2_t t = vpaddlq_s32( + vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); + return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t); +} + SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { +#if defined(__aarch64__) + return vaddlvq_u8(vreinterpretq_u8_s64(x)); +#else uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); return vget_lane_s32( vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +#endif } SIMD_INLINE v128 v128_padd_s16(v128 a) { return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); } -typedef struct { sad64_internal hi, lo; } sad128_internal; +SIMD_INLINE v128 v128_padd_u8(v128 a) { + return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a))); +} + +typedef struct { + sad64_internal hi, lo; +} sad128_internal; SIMD_INLINE sad128_internal v128_sad_u8_init() { sad128_internal s; @@ -117,14 +153,21 @@ SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { } SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { - return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo)); +#if defined(__aarch64__) + return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo); +#else + uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo))); + return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t)); +#endif } -typedef struct { ssd64_internal hi, lo; } ssd128_internal; +typedef struct { + ssd64_internal hi, lo; +} ssd128_internal; SIMD_INLINE ssd128_internal v128_ssd_u8_init() { ssd128_internal s; - s.hi = s.lo = (ssd64_internal)(uint64_t)0; + s.hi = s.lo = v64_ssd_u8_init(); return s; } @@ -154,6 +197,16 @@ SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); } +SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) { + return vreinterpretq_s64_u8( + vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); +} + +SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) { + return vreinterpretq_s64_s8( + vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); +} + SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { return vreinterpretq_s64_s16( vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); @@ -169,6 +222,11 @@ SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); } +SIMD_INLINE v128 v128_add_64(v128 x, v128 y) { + return vreinterpretq_s64_u64( + vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y))); +} + SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { return vreinterpretq_s64_u8( vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); @@ -204,6 +262,8 @@ SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); } +SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); } + SIMD_INLINE v128 v128_abs_s16(v128 x) { return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); } @@ -223,8 +283,16 @@ SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { } SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { +#if defined(__aarch64__) + return vreinterpretq_s64_s16(vuzp2q_s16( + vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), + vreinterpret_s16_s64(vget_low_s64(b)))), + vreinterpretq_s16_s32( + vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))))); +#else return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); +#endif } SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { @@ -233,13 +301,32 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { } SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { +#if defined(__aarch64__) + int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), + vreinterpret_s16_s64(vget_low_s64(b))); + int32x4_t t2 = + vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)); + return vreinterpretq_s64_s32(vpaddq_s32(t1, t2)); +#else return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); +#endif } SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { +#if defined(__aarch64__) + int16x8_t t1 = vmulq_s16( + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))), + vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b)))); + int16x8_t t2 = vmulq_s16( + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))), + vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b)))); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2))); +#else return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); +#endif } SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { @@ -252,6 +339,11 @@ SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); } +SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) { + return vreinterpretq_s64_u16( + vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); +} + SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { return vreinterpretq_s64_u16( vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); @@ -272,6 +364,26 @@ SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); } +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { + a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0))); +#if defined(__aarch64__) + uint8x16_t m = + vandq_u8(vreinterpretq_u8_s64(a), + vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); + return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8); +#else + uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( + vandq_u8(vreinterpretq_u8_s64(a), + vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); + return v64_u64(v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); +#endif +} + +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { + c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0))); + return v128_or(v128_and(b, c), v128_andn(a, c)); +} + SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { return vreinterpretq_s64_s8( vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); @@ -287,14 +399,34 @@ SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); } +SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) { + return vreinterpretq_s64_s32( + vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); return vreinterpretq_s64_u8(r.val[0]); +#endif } SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); return vreinterpretq_s64_u8(r.val[1]); +#endif } SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { @@ -303,13 +435,23 @@ SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { } SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); return vreinterpretq_s64_s16(r.val[0]); +#endif } SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); return vreinterpretq_s64_s16(r.val[1]); +#endif } SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { @@ -318,13 +460,23 @@ SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { } SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); return vreinterpretq_s64_s32(r.val[0]); +#endif } SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); return vreinterpretq_s64_s32(r.val[1]); +#endif } SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { @@ -333,47 +485,76 @@ SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { } SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { - return v128_from_v64(vget_low_u64((uint64x2_t)a), - vget_low_u64((uint64x2_t)b)); + return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b)); } SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { - return v128_from_v64(vget_high_u64((uint64x2_t)a), - vget_high_u64((uint64x2_t)b)); + return v128_from_v64(vget_high_s64((int64x2_t)a), + vget_high_s64((int64x2_t)b)); } SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); return vreinterpretq_s64_u8(r.val[0]); +#endif } SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); +#else uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); return vreinterpretq_s64_u8(r.val[1]); +#endif } SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); return vreinterpretq_s64_u16(r.val[0]); +#endif } SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u16( + vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); +#else uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); return vreinterpretq_s64_u16(r.val[1]); +#endif } SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else uint32x4x2_t r = vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); return vreinterpretq_s64_u32(r.val[0]); +#endif } SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { +#if defined(__aarch64__) + return vreinterpretq_s64_u32( + vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); +#else uint32x4x2_t r = vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); return vreinterpretq_s64_u32(r.val[1]); +#endif } SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { @@ -406,6 +587,12 @@ SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); } +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { + return v128_from_v64( + vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))), + vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b)))); +} + SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { return v128_from_v64( vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), @@ -447,15 +634,17 @@ SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { } SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { - return v128_from_64( - (uint64_t)vreinterpret_s64_u8( - vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)), - vget_high_u8(vreinterpretq_u8_s64(x)) } }, - vreinterpret_u8_s64(vget_high_s64(pattern)))), - (uint64_t)vreinterpret_s64_u8( - vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)), - vget_high_u8(vreinterpretq_u8_s64(x)) } }, - vreinterpret_u8_s64(vget_low_s64(pattern))))); +#if defined(__aarch64__) + return vreinterpretq_s64_u8( + vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern))); +#else + uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)), + vget_high_u8(vreinterpretq_u8_s64(x)) } }; + return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8( + p, vreinterpret_u8_s64(vget_high_s64(pattern)))), + (uint64_t)vreinterpret_s64_u8(vtbl2_u8( + p, vreinterpret_u8_s64(vget_low_s64(pattern))))); +#endif } SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { @@ -488,19 +677,37 @@ SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); } +SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + +SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) { + return vreinterpretq_s64_u32( + vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); +} + SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8( - vreinterpretq_u8_s64(a), vdupq_n_s8(c))); + return (c > 7) ? v128_zero() + : vreinterpretq_s64_u8( + vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c))); } SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() : vreinterpretq_s64_u8(vshlq_u8( - vreinterpretq_u8_s64(a), vdupq_n_s8(-c))); + return (c > 7) ? v128_zero() + : vreinterpretq_s64_u8( + vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c))); } SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { - return (c > 7) ? v128_ones() : vreinterpretq_s64_s8(vshlq_s8( - vreinterpretq_s8_s64(a), vdupq_n_s8(-c))); + return (c > 7) ? v128_ones() + : vreinterpretq_s64_s8( + vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c))); } SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { @@ -539,6 +746,22 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c))); } +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return (c > 63) ? v128_zero() + : vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c))); +} + +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return (c > 63) ? v128_zero() + : vreinterpretq_s64_u64( + vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c))); +} + +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c)); +} + #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { @@ -562,16 +785,18 @@ SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { return n < 8 ? v128_from_64( - vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), n * 8), - vorr_u64( + (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), + n * 8), + (uint64_t)vorr_u64( vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8), vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), (8 - n) * 8))) - : (n == 8 - ? v128_from_64(0, vreinterpret_u64_s64(vget_high_s64(a))) - : v128_from_64( - 0, vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - (n - 8) * 8))); + : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64( + vget_high_s64(a))) + : v128_from_64( + 0, (uint64_t)vshr_n_u64( + vreinterpret_u64_s64(vget_high_s64(a)), + (n - 8) * 8))); } SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { @@ -610,6 +835,18 @@ SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)); } +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { + return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { + return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c)); +} + +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { + return vshrq_n_s64(a, c); +} + #else SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { @@ -666,6 +903,55 @@ SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { return v128_shr_s32(a, c); } +SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { + return v128_shl_64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { + return v128_shr_u64(a, c); +} + +SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { + return v128_shr_s64(a, c); +} + #endif +typedef uint32x4_t sad128_internal_u16; + +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). */ +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { + return vaddq_u32( + s, vpaddlq_u16(vsubq_u16( + vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)), + vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b))))); +} + +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + uint64x2_t t = vpaddlq_u32(s); + return (uint32_t)(uint64_t)vget_high_u64(t) + + (uint32_t)(uint64_t)vget_low_u64(t); +} + +typedef v128 ssd128_internal_s16; +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). */ +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, + v128 b) { + v128 d = v128_sub_16(a, b); + d = v128_madd_s16(d, d); + return v128_add_64( + s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d)))); +} + +SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { + return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); +} + #endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h index 32e7c32de..e508f6ad7 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h @@ -14,8 +14,10 @@ #include #include -#include "./v64_intrinsics_c.h" -#include "./aom_config.h" + +#include "config/aom_config.h" + +#include "aom_dsp/simd/v64_intrinsics_c.h" typedef union { uint8_t u8[16]; @@ -115,11 +117,30 @@ SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) { return t; } +SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) { + c_v128 t; + t.u64[1] = t.u64[0] = x; + return t; +} + +SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) { + return c_v64_dotp_su8(a.v64[1], b.v64[1]) + + c_v64_dotp_su8(a.v64[0], b.v64[0]); +} + SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) { return c_v64_dotp_s16(a.v64[1], b.v64[1]) + c_v64_dotp_s16(a.v64[0], b.v64[0]); } +SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) { + // 32 bit products, 64 bit sum + return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) + + (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) + + (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) + + (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]); +} + SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) { return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]); } @@ -186,6 +207,16 @@ SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) { c_v64_add_16(a.v64[0], b.v64[0])); } +SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]), + c_v64_sadd_u8(a.v64[0], b.v64[0])); +} + +SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]), + c_v64_sadd_s8(a.v64[0], b.v64[0])); +} + SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]), c_v64_sadd_s16(a.v64[0], b.v64[0])); @@ -196,6 +227,15 @@ SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) { c_v64_add_32(a.v64[0], b.v64[0])); } +SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) { + // Two complement overflow (silences sanitizers) + return c_v128_from_64( + a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1 + : a.v64[1].u64 + b.v64[1].u64, + a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1 + : a.v64[0].u64 + b.v64[0].u64); +} + SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) { c_v128 t; t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; @@ -205,6 +245,19 @@ SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) { return t; } +SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) { + c_v128 t; + t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1]; + t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3]; + t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5]; + t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7]; + t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9]; + t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11]; + t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13]; + t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15]; + return t; +} + SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]), c_v64_sub_8(a.v64[0], b.v64[0])); @@ -240,6 +293,15 @@ SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) { c_v64_sub_32(a.v64[0], b.v64[0])); } +SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) { + // Two complement underflow (silences sanitizers) + return c_v128_from_64( + a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1 + : a.v64[1].u64 - b.v64[1].u64, + a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1 + : a.v64[0].u64 - b.v64[0].u64); +} + SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) { return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0])); } @@ -290,6 +352,11 @@ SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) { c_v64_rdavg_u8(a.v64[0], b.v64[0])); } +SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]), + c_v64_rdavg_u16(a.v64[0], b.v64[0])); +} + SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]), c_v64_avg_u16(a.v64[0], b.v64[0])); @@ -310,6 +377,22 @@ SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) { c_v64_min_s8(a.v64[0], b.v64[0])); } +SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) { + return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) { + c_v128 t; + for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i]; + return t; +} + SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]), c_v64_max_s8(a.v64[0], b.v64[0])); @@ -325,6 +408,20 @@ SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) { c_v64_max_s16(a.v64[0], b.v64[0])); } +SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c]; + return t; +} + +SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c]; + return t; +} + SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]), c_v64_ziplo_8(a.v64[0], b.v64[0])); @@ -518,6 +615,11 @@ SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) { c_v64_pack_s32_s16(b.v64[1], b.v64[0])); } +SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) { + return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]), + c_v64_pack_s32_u16(b.v64[1], b.v64[0])); +} + SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) { return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]), c_v64_pack_s16_u8(b.v64[1], b.v64[0])); @@ -559,15 +661,10 @@ SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) { SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) { c_v128 t; int c; - for (c = 0; c < 16; c++) { - if (pattern.u8[c] & ~15) { - fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c], - c); - abort(); - } + for (c = 0; c < 16; c++) t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15) : pattern.u8[c] & 15]; - } + return t; } @@ -601,7 +698,28 @@ SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) { c_v64_cmpeq_16(a.v64[0], b.v64[0])); } -SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) { + c_v128 t; + int c; + for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]); + return t; +} + +SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) { if (n < 8) return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n), c_v64_shr_n_byte(a.v64[0], 8 - n)), @@ -610,7 +728,7 @@ SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) { return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero()); } -SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) { if (n < 8) return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n), c_v64_or(c_v64_shr_n_byte(a.v64[0], n), @@ -619,7 +737,7 @@ SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) { return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8)); } -SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) { +SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) { if (SIMD_CHECK && c > 15) { fprintf(stderr, "Error: undefined alignment %d\n", c); abort(); @@ -628,80 +746,143 @@ SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) { : b; } -SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c), c_v64_shr_u16(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c), c_v64_shr_s16(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c), c_v64_shr_u32(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, unsigned int c) { +SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) { return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c), c_v64_shr_s32(a.v64[0], c)); } -SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) { + a.v64[1].u64 <<= c; + a.v64[0].u64 <<= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) { + a.v64[1].u64 >>= c; + a.v64[0].u64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) { + a.v64[1].s64 >>= c; + a.v64[0].s64 >>= c; + return c_v128_from_v64(a.v64[1], a.v64[0]); +} + +SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) { return c_v128_shl_8(a, n); } -SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) { return c_v128_shl_16(a, n); } -SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) { return c_v128_shl_32(a, n); } -SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) { + return c_v128_shl_64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) { return c_v128_shr_u8(a, n); } -SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) { return c_v128_shr_u16(a, n); } -SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) { return c_v128_shr_u32(a, n); } -SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) { + return c_v128_shr_u64(a, n); +} + +SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) { return c_v128_shr_s8(a, n); } -SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) { return c_v128_shr_s16(a, n); } -SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, unsigned int n) { +SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) { return c_v128_shr_s32(a, n); } +SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) { + return c_v128_shr_s64(a, n); +} + +typedef uint32_t c_sad128_internal_u16; + +SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). */ +SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s, + c_v128 a, c_v128 b) { + int c; + for (c = 0; c < 8; c++) + s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; + return s; +} + +SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; } + +typedef uint64_t c_ssd128_internal_s16; + +SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). */ +SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s, + c_v128 a, c_v128 b) { + int c; + for (c = 0; c < 8; c++) + s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * + (int32_t)(int16_t)(a.s16[c] - b.s16[c]); + return s; +} + +SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; } + #endif /* _V128_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h index cca1788d5..f9043fe99 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h @@ -12,7 +12,8 @@ #ifndef _V128_INTRINSICS_H #define _V128_INTRINSICS_H -#include "./v64_intrinsics_x86.h" +#include +#include "aom_dsp/simd/v64_intrinsics_x86.h" typedef __m128i v128; @@ -62,7 +63,7 @@ SIMD_INLINE void v128_store_unaligned(void *p, v128 a) { // Some compilers will check this during optimisation, others wont. #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) #if defined(__SSSE3__) -SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { +SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) { return c ? _mm_alignr_epi8(a, b, c) : b; } #else @@ -71,7 +72,7 @@ SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { #endif #else #if defined(__SSSE3__) -#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b)) +#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b)) #else #define v128_align(a, b, c) \ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b)) @@ -86,14 +87,25 @@ SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); } SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); } +SIMD_INLINE v128 v128_dup_64(uint64_t x) { + // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers + return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x); +} + SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); } SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); } +SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); } + SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); } SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); } +SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); } + SIMD_INLINE v128 v128_padd_s16(v128 a) { return _mm_madd_epi16(a, _mm_set1_epi16(1)); } @@ -112,6 +124,8 @@ SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); } SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); } +SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); } + SIMD_INLINE v128 v128_abs_s16(v128 a) { #if defined(__SSSE3__) return _mm_abs_epi16(a); @@ -241,6 +255,15 @@ SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { return _mm_packs_epi32(b, a); } +SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_packus_epi32(b, a); +#else + return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)), + v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b))); +#endif +} + SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { return _mm_packus_epi16(b, a); } @@ -291,6 +314,15 @@ SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { #endif } +SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { + v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b)); + v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b)); + v128 t = v128_add_32(t1, t2); + t = v128_add_32(t, _mm_srli_si128(t, 8)); + t = v128_add_32(t, _mm_srli_si128(t, 4)); + return (int32_t)v128_low_u32(t); +} + SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { v128 r = _mm_madd_epi16(a, b); #if defined(__SSE4_1__) && defined(__x86_64__) @@ -325,31 +357,25 @@ SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); } -typedef v128 ssd128_internal; +typedef int32_t ssd128_internal; -SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); } +SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; } /* Implementation dependent return value. Result must be finalised with * v128_ssd_sum(). */ SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { - v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()), - _mm_unpacklo_epi8(b, _mm_setzero_si128())); - v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()), - _mm_unpackhi_epi8(b, _mm_setzero_si128())); + v128 z = _mm_setzero_si128(); + v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z)); + v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z)); v128 rl = _mm_madd_epi16(l, l); v128 rh = _mm_madd_epi16(h, h); - v128 c = _mm_cvtsi32_si128(32); - rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8)); - rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4)); - rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8)); - rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4)); - return _mm_add_epi64( - s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c)); + v128 r = _mm_add_epi32(rl, rh); + r = _mm_add_epi32(r, _mm_srli_si128(r, 8)); + r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); + return s + _mm_cvtsi128_si32(r); } -SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { - return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s))); -} +SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; } SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); } @@ -385,6 +411,14 @@ SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { #endif } +SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { + v128 r = v128_mullo_s32(a, b); + return (int64_t)_mm_cvtsi128_si32(r) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12)); +} + SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); } SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { @@ -399,6 +433,10 @@ SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { #endif } +SIMD_INLINE v128 v128_padd_u8(v128 a) { + return v128_madd_us8(a, _mm_set1_epi8(1)); +} + SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); } SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { @@ -406,6 +444,11 @@ SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1))); } +SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1))); +} + SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); } SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); } @@ -421,6 +464,17 @@ SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { #endif } +SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); } + +SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { +#if defined(__SSE4_1__) + return _mm_blendv_epi8(a, b, c); +#else + c = _mm_cmplt_epi8(c, v128_zero()); + return v128_or(v128_and(b, c), v128_andn(a, c)); +#endif +} + SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { #if defined(__SSE4_1__) return _mm_max_epi8(a, b); @@ -434,6 +488,24 @@ SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); } SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); } +SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_min_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(a, b); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + +SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { +#if defined(__SSE4_1__) + return _mm_max_epi32(a, b); +#else + v128 mask = _mm_cmplt_epi32(b, a); + return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a)); +#endif +} + SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); } SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); } @@ -448,6 +520,16 @@ SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) { return _mm_cmplt_epi16(a, b); } +SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); } + +SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) { + return _mm_cmpgt_epi32(a, b); +} + +SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) { + return _mm_cmplt_epi32(a, b); +} + SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); } SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { @@ -490,10 +572,25 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { return _mm_sra_epi32(a, _mm_cvtsi32_si128(c)); } +SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { + return _mm_sll_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { + return _mm_srl_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { + // _mm_sra_epi64 is missing in gcc? + return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c, + (int64_t)v64_u64(v128_low_v64(a)) >> c); + // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c)); +} + /* These intrinsics require immediate values, so we must use #defines to enforce that. */ -#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c) -#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c) +#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127) +#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127) #define v128_shl_n_8(a, c) \ _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c)) #define v128_shr_n_u8(a, c) \ @@ -507,5 +604,53 @@ SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { #define v128_shl_n_32(a, c) _mm_slli_epi32(a, c) #define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c) #define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c) +#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c) +#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c) +#define v128_shr_n_s64(a, c) \ + v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc? + +typedef v128 sad128_internal_u16; + +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_sad_u16_sum(). */ +SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, + v128 b) { +#if defined(__SSE4_1__) + v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b)); +#else + v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)), + v128_xor(b, v128_dup_16(32768))); + t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)), + v128_or(v128_and(a, t), v128_andn(b, t))); +#endif + return v128_add_32( + s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { + return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) + + v128_low_u32(v128_shr_n_byte(s, 8)) + + v128_low_u32(v128_shr_n_byte(s, 12)); +} + +typedef v128 ssd128_internal_s16; + +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v128_ssd_s16_sum(). */ +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, + v128 b) { + v128 d = v128_sub_16(a, b); + d = v128_madd_s16(d, d); + return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()), + _mm_unpacklo_epi32(d, v128_zero()))); +} + +SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { + return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); +} #endif /* _V128_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h index 1896374ee..0e5ae5b68 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -15,9 +15,10 @@ #include #include #include -#include "./v256_intrinsics_c.h" -#include "./v128_intrinsics.h" -#include "./v64_intrinsics.h" + +#include "aom_dsp/simd/v256_intrinsics_c.h" +#include "aom_dsp/simd/v128_intrinsics.h" +#include "aom_dsp/simd/v64_intrinsics.h" /* Fallback to plain, unoptimised C. */ @@ -25,6 +26,7 @@ typedef c_v256 v256; SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); } SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); } +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); } SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); } SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); } SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { @@ -59,6 +61,7 @@ SIMD_INLINE v256 v256_zero() { return c_v256_zero(); } SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); } SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); } SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); } +SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); } typedef uint32_t sad256_internal; SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); } @@ -76,9 +79,16 @@ SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { return c_v256_ssd_u8_sum(s); } + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return c_v256_dotp_su8(a, b); +} SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { return c_v256_dotp_s16(a, b); } +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return c_v256_dotp_s32(a, b); +} SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); } SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); } @@ -88,8 +98,13 @@ SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); } SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); } SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); } +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); } +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); } SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); } SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); } +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); } +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); } +SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); } SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); } SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); } SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); } @@ -114,8 +129,16 @@ SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); } SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); } +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); } +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return c_v256_blend_8(a, b, c); +} + SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); } SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); } +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return c_v256_rdavg_u16(a, b); +} SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); } SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); } SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); } @@ -123,6 +146,8 @@ SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); } SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); } SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); } SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); } +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); } +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); } SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); } SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); } @@ -159,6 +184,12 @@ SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { return c_v256_unziphi_32(a, b); } +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { + return c_v256_unziplo_64(a, b); +} +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return c_v256_unziphi_64(a, b); +} SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); } SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { return c_v256_unpacklo_u8_s16(a); @@ -176,6 +207,9 @@ SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { return c_v256_pack_s32_s16(a, b); } +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return c_v256_pack_s32_u16(a, b); +} SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { return c_v256_pack_s16_u8(a, b); } @@ -203,6 +237,9 @@ SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { return c_v256_shuffle_8(a, pattern); } +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + return c_v256_wideshuffle_8(a, b, pattern); +} SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { return c_v256_pshuffle_8(a, pattern); } @@ -217,7 +254,14 @@ SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { return c_v256_cmplt_s16(a, b); } SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); } +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); } +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return c_v256_cmpgt_s32(a, b); +} +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return c_v256_cmplt_s32(a, b); +} SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { return c_v256_shl_8(a, c); } @@ -261,6 +305,9 @@ SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) { SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) { return c_v256_shl_n_32(a, n); } +SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) { + return c_v256_shl_n_64(a, n); +} SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) { return c_v256_shr_n_u8(a, n); } @@ -270,6 +317,9 @@ SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) { SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) { return c_v256_shr_n_u32(a, n); } +SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) { + return c_v256_shr_n_u64(a, n); +} SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) { return c_v256_shr_n_s8(a, n); } @@ -279,5 +329,39 @@ SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) { SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) { return c_v256_shr_n_s32(a, n); } +SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) { + return c_v256_shr_n_s64(a, n); +} + +SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) { + return c_v256_shr_n_word(a, n); +} +SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) { + return c_v256_shl_n_word(a, n); +} + +typedef uint32_t sad256_internal_u16; +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { + return c_v256_sad_u16_init(); +} +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + return c_v256_sad_u16(s, a, b); +} +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return c_v256_sad_u16_sum(s); +} + +typedef uint64_t ssd256_internal_s16; +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { + return c_v256_ssd_s16_init(); +} +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, + v256 b) { + return c_v256_ssd_s16(s, a, b); +} +SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { + return c_v256_ssd_s16_sum(s); +} #endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h index ba4ed719d..d96638488 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h @@ -12,6 +12,6 @@ #ifndef _V256_INTRINSICS_H #define _V256_INTRINSICS_H -#include "./v256_intrinsics_v128.h" +#include "aom_dsp/simd/v256_intrinsics_v128.h" #endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h index f96ca7fa6..5b412df71 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h @@ -14,8 +14,10 @@ #include #include -#include "./v128_intrinsics_c.h" -#include "./aom_config.h" + +#include "config/aom_config.h" + +#include "aom_dsp/simd/v128_intrinsics_c.h" typedef union { uint8_t u8[32]; @@ -34,6 +36,8 @@ SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; } SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; } +SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; } + SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; } SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; } @@ -120,23 +124,39 @@ SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) { return t; } +SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) { + c_v256 t; + t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x; + return t; +} + +SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) { + return c_v128_dotp_su8(a.v128[1], b.v128[1]) + + c_v128_dotp_su8(a.v128[0], b.v128[0]); +} + SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) { return c_v128_dotp_s16(a.v128[1], b.v128[1]) + c_v128_dotp_s16(a.v128[0], b.v128[0]); } +SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) { + return c_v128_dotp_s32(a.v128[1], b.v128[1]) + + c_v128_dotp_s32(a.v128[0], b.v128[0]); +} + SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) { return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]); } typedef uint32_t c_sad256_internal; -SIMD_INLINE c_sad128_internal c_v256_sad_u8_init() { return 0; } +SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; } /* Implementation dependent return value. Result must be finalised with v256_sad_u8_sum(). The result for more than 16 v256_sad_u8() calls is undefined. */ -SIMD_INLINE c_sad128_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, +SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a, c_v256 b) { int c; for (c = 0; c < 32; c++) @@ -191,6 +211,16 @@ SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) { c_v128_add_16(a.v128[0], b.v128[0])); } +SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]), + c_v128_sadd_s8(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]), + c_v128_sadd_u8(a.v128[0], b.v128[0])); +} + SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]), c_v128_sadd_s16(a.v128[0], b.v128[0])); @@ -201,6 +231,23 @@ SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) { c_v128_add_32(a.v128[0], b.v128[0])); } +SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]), + c_v128_add_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]), + c_v128_sub_64(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) { + c_v256 t; + for (int i = 0; i < 16; i++) + t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1]; + return t; +} + SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) { c_v256 t; t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1]; @@ -299,6 +346,11 @@ SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) { c_v128_rdavg_u8(a.v128[0], b.v128[0])); } +SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]), + c_v128_rdavg_u16(a.v128[0], b.v128[0])); +} + SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]), c_v128_avg_u16(a.v128[0], b.v128[0])); @@ -319,6 +371,30 @@ SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) { c_v128_min_s8(a.v128[0], b.v128[0])); } +SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) { + return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) | + ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) | + ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) | + ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) | + ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) | + ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) | + ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) | + ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) | + ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) | + ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) | + ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) | + ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) | + ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) | + ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) | + ((a.s8[0] < 0) << 0); +} + +SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) { + c_v256 t; + for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i]; + return t; +} + SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]), c_v128_max_s8(a.v128[0], b.v128[0])); @@ -334,6 +410,16 @@ SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) { c_v128_max_s16(a.v128[0], b.v128[0])); } +SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]), + c_v128_min_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]), + c_v128_max_s32(a.v128[0], b.v128[0])); +} + SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]), c_v128_ziplo_8(a.v128[0], b.v128[0])); @@ -482,6 +568,32 @@ SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) { : _c_v256_unzip_32(b, a, 1); } +SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) { + c_v256 t; + if (mode) { + t.u64[3] = b.u64[3]; + t.u64[2] = b.u64[1]; + t.u64[1] = a.u64[3]; + t.u64[0] = a.u64[1]; + } else { + t.u64[3] = a.u64[2]; + t.u64[2] = a.u64[0]; + t.u64[1] = b.u64[2]; + t.u64[0] = b.u64[0]; + } + return t; +} + +SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1) + : _c_v256_unzip_64(a, b, 0); +} + +SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) { + return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0) + : _c_v256_unzip_64(b, a, 1); +} + SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) { return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a)); } @@ -515,6 +627,11 @@ SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) { c_v128_pack_s32_s16(b.v128[1], b.v128[0])); } +SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]), + c_v128_pack_s32_u16(b.v128[1], b.v128[0])); +} + SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) { return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]), c_v128_pack_s16_u8(b.v128[1], b.v128[0])); @@ -558,15 +675,21 @@ SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) { SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) { c_v256 t; int c; - for (c = 0; c < 32; c++) { - if (pattern.u8[c] & ~31) { - fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c], - c); - abort(); - } + for (c = 0; c < 32; c++) t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) : pattern.u8[c] & 31]; - } + + return t; +} + +SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) { + c_v256 t; + int c; + for (c = 0; c < 32; c++) + t.u8[c] = (pattern.u8[c] < 32 + ? b.u8 + : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31) + : pattern.u8[c] & 31]; return t; } @@ -607,6 +730,21 @@ SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) { c_v128_cmpeq_16(a.v128[0], b.v128[0])); } +SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]), + c_v128_cmpgt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]), + c_v128_cmplt_s32(a.v128[0], b.v128[0])); +} + +SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) { + return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]), + c_v128_cmpeq_32(a.v128[0], b.v128[0])); +} + SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) { if (n < 16) return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n), @@ -685,6 +823,45 @@ SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) { c_v128_shr_s32(a.v128[0], c)); } +SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) { + c_v256 t; + if (SIMD_CHECK && n > 63) { + fprintf(stderr, "Error: undefined s64 shift right %d\n", n); + abort(); + } + t.s64[3] = a.s64[3] >> n; + t.s64[2] = a.s64[2] >> n; + t.s64[1] = a.s64[1] >> n; + t.s64[0] = a.s64[0] >> n; + return t; +} + +SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) { + c_v256 t; + if (SIMD_CHECK && n > 63) { + fprintf(stderr, "Error: undefined s64 shift right %d\n", n); + abort(); + } + t.u64[3] = a.u64[3] >> n; + t.u64[2] = a.u64[2] >> n; + t.u64[1] = a.u64[1] >> n; + t.u64[0] = a.u64[0] >> n; + return t; +} + +SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) { + c_v256 t; + if (SIMD_CHECK && n > 63) { + fprintf(stderr, "Error: undefined s64 shift right %d\n", n); + abort(); + } + t.u64[3] = a.u64[3] << n; + t.u64[2] = a.u64[2] << n; + t.u64[1] = a.u64[1] << n; + t.u64[0] = a.u64[0] << n; + return t; +} + SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) { return c_v256_shl_8(a, n); } @@ -697,6 +874,10 @@ SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) { return c_v256_shl_32(a, n); } +SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) { + return c_v256_shl_64(a, n); +} + SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) { return c_v256_shr_u8(a, n); } @@ -709,6 +890,10 @@ SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) { return c_v256_shr_u32(a, n); } +SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) { + return c_v256_shr_u64(a, n); +} + SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) { return c_v256_shr_s8(a, n); } @@ -721,4 +906,48 @@ SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) { return c_v256_shr_s32(a, n); } +SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) { + return c_v256_shr_s64(a, n); +} + +SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) { + return c_v256_shr_n_byte(a, 2 * n); +} +SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) { + return c_v256_shl_n_byte(a, 2 * n); +} + +typedef uint32_t c_sad256_internal_u16; + +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). */ +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c]; + return s; +} + +SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; } + +typedef uint64_t c_ssd256_internal_s16; + +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, + c_v256 a, c_v256 b) { + int c; + for (c = 0; c < 16; c++) + s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) * + (int32_t)(int16_t)(a.s16[c] - b.s16[c]); + return s; +} + +SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } + #endif /* _V256_INTRINSICS_C_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h index cbea55ca1..60b2a1791 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h @@ -13,27 +13,35 @@ #define _V256_INTRINSICS_V128_H #if HAVE_NEON -#include "./v128_intrinsics_arm.h" +#include "aom_dsp/simd/v128_intrinsics_arm.h" #elif HAVE_SSE2 -#include "./v128_intrinsics_x86.h" +#include "aom_dsp/simd/v128_intrinsics_x86.h" #else -#include "./v128_intrinsics.h" +#include "aom_dsp/simd/v128_intrinsics.h" #endif -typedef struct { v128 lo, hi; } v256; +#if HAVE_NEON +typedef int64x2x2_t v256; +#else +typedef struct { + v128 val[2]; +} v256; +#endif -SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); } +SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } -SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); } +SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); } -SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; } +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } -SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; } +SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; } + +SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; } SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) { v256 t; - t.hi = hi; - t.lo = lo; + t.val[1] = hi; + t.val[0] = lo; return t; } @@ -56,13 +64,13 @@ SIMD_INLINE v256 v256_load_aligned(const void *p) { } SIMD_INLINE void v256_store_unaligned(void *p, v256 a) { - v128_store_unaligned(p, a.lo); - v128_store_unaligned((uint8_t *)p + 16, a.hi); + v128_store_unaligned(p, a.val[0]); + v128_store_unaligned((uint8_t *)p + 16, a.val[1]); } SIMD_INLINE void v256_store_aligned(void *p, v256 a) { - v128_store_aligned(p, a.lo); - v128_store_aligned((uint8_t *)p + 16, a.hi); + v128_store_aligned(p, a.val[0]); + v128_store_aligned((uint8_t *)p + 16, a.val[1]); } SIMD_INLINE v256 v256_zero() { @@ -84,23 +92,35 @@ SIMD_INLINE v256 v256_dup_32(uint32_t x) { return v256_from_v128(t, t); } +SIMD_INLINE v256 v256_dup_64(uint64_t x) { + v128 t = v128_dup_64(x); + return v256_from_v128(t, t); +} + +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]); +} + SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { - return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo); + return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]); +} + +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]); } SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { - return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo); + return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]); } typedef struct { - sad128_internal hi; - sad128_internal lo; + sad128_internal val[2]; } sad256_internal; SIMD_INLINE sad256_internal v256_sad_u8_init() { sad256_internal t; - t.hi = v128_sad_u8_init(); - t.lo = v128_sad_u8_init(); + t.val[1] = v128_sad_u8_init(); + t.val[0] = v128_sad_u8_init(); return t; } @@ -109,24 +129,23 @@ SIMD_INLINE sad256_internal v256_sad_u8_init() { The result for more than 16 v256_sad_u8() calls is undefined. */ SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { sad256_internal t; - t.hi = v128_sad_u8(s.hi, a.hi, b.hi); - t.lo = v128_sad_u8(s.lo, a.lo, b.lo); + t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]); return t; } SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) { - return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo); + return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]); } typedef struct { - ssd128_internal hi; - ssd128_internal lo; + ssd128_internal val[2]; } ssd256_internal; SIMD_INLINE ssd256_internal v256_ssd_u8_init() { ssd256_internal t; - t.hi = v128_ssd_u8_init(); - t.lo = v128_ssd_u8_init(); + t.val[1] = v128_ssd_u8_init(); + t.val[0] = v128_ssd_u8_init(); return t; } @@ -134,85 +153,124 @@ SIMD_INLINE ssd256_internal v256_ssd_u8_init() { * v256_ssd_u8_sum(). */ SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { ssd256_internal t; - t.hi = v128_ssd_u8(s.hi, a.hi, b.hi); - t.lo = v128_ssd_u8(s.lo, a.lo, b.lo); + t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]); return t; } SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) { - return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo); + return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]); } SIMD_INLINE v256 v256_or(v256 a, v256 b) { - return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo)); + return v256_from_v128(v128_or(a.val[1], b.val[1]), + v128_or(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_xor(v256 a, v256 b) { - return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo)); + return v256_from_v128(v128_xor(a.val[1], b.val[1]), + v128_xor(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_and(v256 a, v256 b) { - return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo)); + return v256_from_v128(v128_and(a.val[1], b.val[1]), + v128_and(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_andn(v256 a, v256 b) { - return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo)); + return v256_from_v128(v128_andn(a.val[1], b.val[1]), + v128_andn(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { - return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo)); + return v256_from_v128(v128_add_8(a.val[1], b.val[1]), + v128_add_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { - return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo)); + return v256_from_v128(v128_add_16(a.val[1], b.val[1]), + v128_add_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]), + v128_sadd_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { + return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]), + v128_sadd_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { - return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo)); + return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]), + v128_sadd_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { - return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo)); + return v256_from_v128(v128_add_32(a.val[1], b.val[1]), + v128_add_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { + return v256_from_v128(v128_add_64(a.val[1], b.val[1]), + v128_add_64(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0])); } SIMD_INLINE v256 v256_padd_s16(v256 a) { - return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo)); + return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0])); } SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { - return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo)); + return v256_from_v128(v128_sub_8(a.val[1], b.val[1]), + v128_sub_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { - return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo)); + return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]), + v128_ssub_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { - return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo)); + return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]), + v128_ssub_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { - return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo)); + return v256_from_v128(v128_sub_16(a.val[1], b.val[1]), + v128_sub_16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { - return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo)); + return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]), + v128_ssub_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { - return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo)); + return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]), + v128_ssub_u16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { - return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo)); + return v256_from_v128(v128_sub_32(a.val[1], b.val[1]), + v128_sub_32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { + return v256_from_v128(v128_sub_64(a.val[1], b.val[1]), + v128_sub_64(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_abs_s16(v256 a) { - return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo)); + return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0])); } SIMD_INLINE v256 v256_abs_s8(v256 a) { - return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo)); + return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0])); } SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { @@ -223,99 +281,146 @@ SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { } SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) { - return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo)); + return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]), + v128_mullo_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) { - return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo)); + return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]), + v128_mulhi_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) { - return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo)); + return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]), + v128_mullo_s32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { - return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo)); + return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]), + v128_madd_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { - return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo)); + return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]), + v128_madd_us8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { - return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo)); + return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]), + v128_avg_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { - return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo)); + return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]), + v128_rdavg_u8(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]), + v128_rdavg_u16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { - return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo)); + return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]), + v128_avg_u16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { - return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo)); + return v256_from_v128(v128_min_u8(a.val[1], b.val[1]), + v128_min_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { - return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo)); + return v256_from_v128(v128_max_u8(a.val[1], b.val[1]), + v128_max_u8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { - return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo)); + return v256_from_v128(v128_min_s8(a.val[1], b.val[1]), + v128_min_s8(a.val[0], b.val[0])); +} + +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { + return (v128_movemask_8(v256_high_v128(a)) << 16) | + v128_movemask_8(v256_low_v128(a)); +} + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]), + v128_blend_8(a.val[0], b.val[0], c.val[0])); } SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { - return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo)); + return v256_from_v128(v128_max_s8(a.val[1], b.val[1]), + v128_max_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { - return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo)); + return v256_from_v128(v128_min_s16(a.val[1], b.val[1]), + v128_min_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { - return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo)); + return v256_from_v128(v128_max_s16(a.val[1], b.val[1]), + v128_max_s16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { + return v256_from_v128(v128_min_s32(a.val[1], b.val[1]), + v128_min_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { + return v256_from_v128(v128_max_s32(a.val[1], b.val[1]), + v128_max_s32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo)); + return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]), + v128_ziplo_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi)); + return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]), + v128_ziplo_8(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo)); + return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]), + v128_ziplo_16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi)); + return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]), + v128_ziplo_16(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo)); + return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]), + v128_ziplo_32(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi)); + return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]), + v128_ziplo_32(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo)); + return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]), + v128_ziplo_64(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi)); + return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]), + v128_ziplo_64(a.val[1], b.val[1])); } SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { - return v256_from_v128(a.lo, b.lo); + return v256_from_v128(a.val[0], b.val[0]); } SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) { - return v256_from_v128(a.hi, b.hi); + return v256_from_v128(a.val[1], b.val[1]); } SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { @@ -331,31 +436,59 @@ SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { } SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo)); + return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]), + v128_unziplo_8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo)); + return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]), + v128_unziphi_8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_16(a.hi, a.lo), - v128_unziplo_16(b.hi, b.lo)); + return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]), + v128_unziplo_16(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_16(a.hi, a.lo), - v128_unziphi_16(b.hi, b.lo)); + return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]), + v128_unziphi_16(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_32(a.hi, a.lo), - v128_unziplo_32(b.hi, b.lo)); + return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]), + v128_unziplo_32(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_32(a.hi, a.lo), - v128_unziphi_32(b.hi, b.lo)); + return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]), + v128_unziphi_32(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 0)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 0))); +#else + return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]), + v128_low_v64(b.val[1]), v128_low_v64(b.val[0])); +#endif +} + +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { +#if HAVE_SSE2 + return v256_from_v128( + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]), + _mm_castsi128_pd(a.val[1]), 3)), + _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]), + _mm_castsi128_pd(b.val[1]), 3))); +#else + return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]), + v128_high_v64(b.val[1]), v128_high_v64(b.val[0])); +#endif } SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { @@ -363,11 +496,13 @@ SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { } SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo)); + return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]), + v128_unpacklo_u8_s16(a.val[0])); } SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi)); + return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]), + v128_unpacklo_u8_s16(a.val[1])); } SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { @@ -375,26 +510,33 @@ SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { } SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_s8_s16(a.lo), v128_unpacklo_s8_s16(a.lo)); + return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]), + v128_unpacklo_s8_s16(a.val[0])); } SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_s8_s16(a.hi), v128_unpacklo_s8_s16(a.hi)); + return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]), + v128_unpacklo_s8_s16(a.val[1])); } SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { - return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo), - v128_pack_s32_s16(b.hi, b.lo)); + return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]), + v128_pack_s32_s16(b.val[1], b.val[0])); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]), + v128_pack_s32_u16(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { - return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo), - v128_pack_s16_u8(b.hi, b.lo)); + return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]), + v128_pack_s16_u8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { - return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo), - v128_pack_s16_s8(b.hi, b.lo)); + return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]), + v128_pack_s16_s8(b.val[1], b.val[0])); } SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { @@ -406,142 +548,326 @@ SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { } SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_u16_s32(a.lo), - v128_unpacklo_u16_s32(a.lo)); + return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]), + v128_unpacklo_u16_s32(a.val[0])); } SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_s16_s32(a.lo), - v128_unpacklo_s16_s32(a.lo)); + return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]), + v128_unpacklo_s16_s32(a.val[0])); } SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_u16_s32(a.hi), - v128_unpacklo_u16_s32(a.hi)); + return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]), + v128_unpacklo_u16_s32(a.val[1])); } SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_s16_s32(a.hi), - v128_unpacklo_s16_s32(a.hi)); -} - -SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { - v128 c16 = v128_dup_8(16); - v128 maskhi = v128_cmplt_s8(pattern.hi, c16); - v128 masklo = v128_cmplt_s8(pattern.lo, c16); - return v256_from_v128( - v128_or( - v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi), - v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)), - v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo), - v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)), - masklo))); -} - -SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { - return v256_from_v128( - v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), - v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); + return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]), + v128_unpacklo_s16_s32(a.val[1])); } SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { - return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo)); + return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]), + v128_cmpgt_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { - return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo)); + return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]), + v128_cmplt_s8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { - return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo)); + return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]), + v128_cmpeq_8(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { - return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo)); + return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]), + v128_cmpgt_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { - return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo)); + return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]), + v128_cmplt_s16(a.val[0], b.val[0])); } SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { - return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo)); + return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]), + v128_cmpeq_16(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]), + v128_cmpgt_s32(a.val[0], b.val[0])); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]), + v128_cmplt_s32(a.val[0], b.val[0])); } -SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { - return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c)); +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]), + v128_cmpeq_32(a.val[0], b.val[0])); } -SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) { - return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c)); +SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { +#if HAVE_NEON +#if defined(__aarch64__) + uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]), + vreinterpretq_u8_s64(x.val[1]) } }; + return v256_from_v128( + vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), + vreinterpretq_s64_u8( + vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); +#else + uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), + vget_high_u8(vreinterpretq_u8_s64(x.val[0])), + vget_low_u8(vreinterpretq_u8_s64(x.val[1])), + vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; + return v256_from_64( + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), + (uint64_t)vreinterpret_s64_u8( + vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); +#endif +#else + v128 c16 = v128_dup_8(16); + v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); + v128 masklo = v128_cmplt_s8(pattern.val[0], c16); + return v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); +#endif } -SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) { - return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c)); +SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { +#if HAVE_NEON +#if defined(__aarch64__) + uint8x16x4_t p = { { + vreinterpretq_u8_s64(y.val[0]), + vreinterpretq_u8_s64(y.val[1]), + vreinterpretq_u8_s64(x.val[0]), + vreinterpretq_u8_s64(x.val[1]), + } }; + return v256_from_v128( + vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), + vreinterpretq_s64_u8( + vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); +#else + v256 c32 = v256_dup_8(32); + v256 p32 = v256_sub_8(pattern, c32); + uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), + vget_high_u8(vreinterpretq_u8_s64(x.val[0])), + vget_low_u8(vreinterpretq_u8_s64(x.val[1])), + vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; + uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])), + vget_high_u8(vreinterpretq_u8_s64(y.val[0])), + vget_low_u8(vreinterpretq_u8_s64(y.val[1])), + vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } }; + v256 r1 = + v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + p, vreinterpret_u8_s64(vget_low_s64(p32.val[0]))))); + v256 r2 = + v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))), + (uint64_t)vreinterpret_s64_u8(vtbl4_u8( + q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))))); + return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32)); +#endif +#else + v128 c16 = v128_dup_8(16); + v128 c32 = v128_dup_8(32); + v128 c48 = v128_dup_8(48); + v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]); + v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]); + v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]); + v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]); + v256 r1 = v256_from_v128( + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)), + maskhi48), + v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)), + v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)), + masklo48)); + v256 r2 = v256_from_v128( + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)), + v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16), + v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), + v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); + return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); +#endif +} + +SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { + return v256_from_v128( + v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)), + v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern))); } -SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) { - return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c)); +SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c)); } -SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) { - return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c)); +SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c)); } -SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) { - return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c)); +SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c)); } -SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) { - return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c)); +SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c)); } -SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { - return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c)); +SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c)); } -SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { - return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c)); +SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) { + return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c)); } /* These intrinsics require immediate values, so we must use #defines to enforce that. */ -#define v256_shl_n_byte(a, n) \ - ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n), \ - v128_shr_n_byte(a.lo, (16 - (n)) & 31)), \ - v128_shl_n_byte(a.lo, (n))) \ - : v256_from_v128( \ - (n) > 16 ? v128_shl_n_byte(a.lo, ((n)-16) & 31) : a.lo, \ +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \ + v128_shr_n_byte(a.val[0], 16 - (n))), \ + v128_shl_n_byte(a.val[0], (n))) \ + : v256_from_v128( \ + (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \ v128_zero())) -#define v256_shr_n_byte(a, n) \ - ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n), \ - v128_or(v128_shr_n_byte(a.lo, n), \ - v128_shl_n_byte(a.hi, (16 - (n)) & 31))) \ - : v256_from_v128( \ - v128_zero(), \ - (n) > 16 ? v128_shr_n_byte(a.hi, ((n)-16) & 31) : a.hi)) +#define v256_shr_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \ + v128_or(v128_shr_n_byte(a.val[0], n), \ + v128_shl_n_byte(a.val[1], 16 - (n)))) \ + : v256_from_v128( \ + v128_zero(), \ + (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])) #define v256_align(a, b, c) \ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b) #define v256_shl_n_8(a, n) \ - v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n)) + v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n)) #define v256_shl_n_16(a, n) \ - v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n)) + v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n)) #define v256_shl_n_32(a, n) \ - v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n)) + v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n)) +#define v256_shl_n_64(a, n) \ + v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n)) #define v256_shr_n_u8(a, n) \ - v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n)) + v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n)) #define v256_shr_n_u16(a, n) \ - v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n)) + v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n)) #define v256_shr_n_u32(a, n) \ - v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n)) + v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n)) +#define v256_shr_n_u64(a, n) \ + v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n)) #define v256_shr_n_s8(a, n) \ - v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n)) + v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n)) #define v256_shr_n_s16(a, n) \ - v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n)) + v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n)) #define v256_shr_n_s32(a, n) \ - v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n)) + v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n)) +#define v256_shr_n_s64(a, n) \ + v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n)) + +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef struct { + sad128_internal_u16 val[2]; +} sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16_init(); + t.val[0] = v128_sad_u16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + v256_sad_u16_sum(). + The result for more than 16 v256_sad_u16() calls is undefined. */ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { + sad256_internal_u16 t; + t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]); +} + +typedef struct { + ssd128_internal_s16 val[2]; +} ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { + ssd256_internal_s16 t; + t.val[1] = v128_ssd_s16_init(); + t.val[0] = v128_ssd_s16_init(); + return t; +} + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, + v256 b) { + ssd256_internal_s16 t; + t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]); + t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]); + return t; +} + +SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { + return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); +} #endif /* _V256_INTRINSICS_V128_H */ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h index b82daab68..05f205169 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h @@ -14,7 +14,7 @@ #if !defined(__AVX2__) -#include "./v256_intrinsics_v128.h" +#include "aom_dsp/simd/v256_intrinsics_v128.h" #else @@ -26,7 +26,8 @@ #endif #include -#include "./v128_intrinsics_x86.h" + +#include "aom_dsp/simd/v128_intrinsics_x86.h" typedef __m256i v256; @@ -38,9 +39,9 @@ SIMD_INLINE v64 v256_low_v64(v256 a) { return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero()); } -SIMD_INLINE v128 v256_low_v128(v256 a) { - return _mm256_extracti128_si256(a, 0); -} +SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); } + +SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); } SIMD_INLINE v128 v256_high_v128(v256 a) { return _mm256_extracti128_si256(a, 1); @@ -48,8 +49,7 @@ SIMD_INLINE v128 v256_high_v128(v256 a) { SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) { // gcc seems to be missing _mm256_set_m128i() - return _mm256_insertf128_si256( - _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1); + return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1); } SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) { @@ -84,16 +84,28 @@ SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); } SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); } +SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); } + SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); } SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); } +SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); } + +SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); } + SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return _mm256_adds_epi16(a, b); } SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); } +SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); } + +SIMD_INLINE v256 v256_padd_u8(v256 a) { + return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1)); +} + SIMD_INLINE v256 v256_padd_s16(v256 a) { return _mm256_madd_epi16(a, _mm256_set1_epi16(1)); } @@ -116,6 +128,8 @@ SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); } +SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); } + SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); } SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } @@ -125,43 +139,51 @@ SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); } // unpack/pack intrinsics operate on the 256 bit input vector as 2 // independent 128 bit vectors. SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)), - v128_ziplo_8(v256_low_v128(a), v256_low_v128(b))); + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)), - v128_ziplo_8(v256_high_v128(a), v256_high_v128(b))); + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)), - v128_ziplo_16(v256_low_v128(a), v256_low_v128(b))); + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)), - v128_ziplo_16(v256_high_v128(a), v256_high_v128(b))); + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)), - v128_ziplo_32(v256_low_v128(a), v256_low_v128(b))); + return _mm256_unpacklo_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)), - v128_ziplo_32(v256_high_v128(a), v256_high_v128(b))); + return _mm256_unpackhi_epi32( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)), - v128_ziplo_64(v256_low_v128(a), v256_low_v128(b))); + return _mm256_unpacklo_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { - return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)), - v128_ziplo_64(v256_high_v128(a), v256_high_v128(b))); + return _mm256_unpackhi_epi64( + _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))); } SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) { @@ -184,34 +206,54 @@ SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b)); } +SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)), - v128_unziplo_8(v256_high_v128(b), v256_low_v128(b))); + return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1)); } -SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)), - v128_unziphi_8(v256_high_v128(b), v256_low_v128(b))); +SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)), + _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)), - v128_unziplo_16(v256_high_v128(b), v256_low_v128(b))); + return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2)); } -SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)), - v128_unziphi_16(v256_high_v128(b), v256_low_v128(b))); +SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + _mm256_castsi256_ps(a), + _MM_SHUFFLE(3, 1, 3, 1))), + _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) { - return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)), - v128_unziplo_32(v256_high_v128(b), v256_low_v128(b))); + return _mm256_permute4x64_epi64( + _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), + _mm256_castsi256_ps(a), + _MM_SHUFFLE(2, 0, 2, 0))), + _MM_SHUFFLE(3, 1, 2, 0)); } -SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) { - return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)), - v128_unziphi_32(v256_high_v128(b), v256_low_v128(b))); +SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b), + _mm256_castsi256_pd(a), 15)), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) { + return _mm256_permute4x64_epi64( + _mm256_castpd_si256( + _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)), + _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { @@ -219,13 +261,15 @@ SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { } SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)), - v128_unpacklo_u8_s16(v256_low_v128(a))); + return _mm256_unpacklo_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)), - v128_unpacklo_u8_s16(v256_high_v128(a))); + return _mm256_unpackhi_epi8( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { @@ -233,28 +277,37 @@ SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { } SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_s8_s16(v256_low_v128(a)), - v128_unpacklo_s8_s16(v256_low_v128(a))); + return _mm256_srai_epi16( + _mm256_unpacklo_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); } SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) { - return v256_from_v128(v128_unpackhi_s8_s16(v256_high_v128(a)), - v128_unpacklo_s8_s16(v256_high_v128(a))); + return _mm256_srai_epi16( + _mm256_unpackhi_epi8( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 8); } SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) { - return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)), - v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b))); + return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); +} + +SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) { + return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) { - return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)), - v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b))); + return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) { - return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)), - v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b))); + return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a), + _MM_SHUFFLE(3, 1, 2, 0)); } SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) { @@ -266,43 +319,73 @@ SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) { } SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)), - v128_unpacklo_u16_s32(v256_low_v128(a))); + return _mm256_unpacklo_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)), - v128_unpacklo_s16_s32(v256_low_v128(a))); + return _mm256_srai_epi32( + _mm256_unpacklo_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); } SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)), - v128_unpacklo_u16_s32(v256_high_v128(a))); + return _mm256_unpackhi_epi16( + _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)), + _mm256_setzero_si256()); } SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) { - return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)), - v128_unpacklo_s16_s32(v256_high_v128(a))); + return _mm256_srai_epi32( + _mm256_unpackhi_epi16( + a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))), + 16); } + SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) { - v128 c16 = v128_dup_8(16); - v128 hi = v256_high_v128(pattern); - v128 lo = v256_low_v128(pattern); - v128 maskhi = v128_cmplt_s8(hi, c16); - v128 masklo = v128_cmplt_s8(lo, c16); - return v256_from_v128( - v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi), - v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)), - maskhi)), - v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo), - v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)), - masklo))); + return _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); +} + +SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) { + v256 c32 = v256_dup_8(32); + v256 p32 = v256_sub_8(pattern, c32); + v256 r1 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32), + _mm256_cmpgt_epi8(v256_dup_8(48), pattern)); + v256 r2 = _mm256_blendv_epi8( + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern), + _mm256_shuffle_epi8( + _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern), + _mm256_cmpgt_epi8(v256_dup_8(16), pattern)); + return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern)); } SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { return _mm256_shuffle_epi8(a, pattern); } +SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) { + v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b)); + v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b)); + t1 = _mm256_add_epi32(t1, t2); + v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0), + _mm256_extracti128_si256(t1, 1)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v128_low_u32(t); +} + SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { v256 r = _mm256_madd_epi16(a, b); #if defined(__x86_64__) @@ -326,6 +409,29 @@ SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) { #endif } +SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) { + v256 r = _mm256_mullo_epi32(a, b); +#if defined(__x86_64__) + v128 t; + r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)), + _mm256_cvtepi32_epi64(v256_low_v128(r))); + t = v256_low_v128(_mm256_add_epi64( + r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1)))); + return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8))); +#else + v128 l = v256_low_v128(r); + v128 h = v256_high_v128(r); + return (int64_t)_mm_cvtsi128_si32(l) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) + + (int64_t)_mm_cvtsi128_si32(h) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) + + (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12)); +#endif +} + SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256()); v128 lo = v256_low_v128(t); @@ -341,7 +447,7 @@ SIMD_INLINE sad256_internal v256_sad_u8_init() { } /* Implementation dependent return value. Result must be finalised with - v256_sad_sum(). + v256_sad_u8_sum(). The result for more than 32 v256_sad_u8() calls is undefined. */ SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) { return _mm256_add_epi64(s, _mm256_sad_epu8(a, b)); @@ -359,7 +465,7 @@ SIMD_INLINE ssd256_internal v256_ssd_u8_init() { } /* Implementation dependent return value. Result must be finalised with - * v256_ssd_sum(). */ + * v256_ssd_u8_sum(). */ SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) { v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()), _mm256_unpacklo_epi8(b, _mm256_setzero_si256())); @@ -425,6 +531,12 @@ SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1))); } +SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) { + return _mm256_sub_epi16( + _mm256_avg_epu16(a, b), + _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1))); +} + SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); } SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); } @@ -433,18 +545,28 @@ SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); } SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); } +SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); } + +SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) { + return _mm256_blendv_epi8(a, b, c); +} + SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); } SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); } SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); } +SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); } + +SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); } + SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return _mm256_cmpgt_epi8(a, b); } SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { - return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a)); + return _mm256_cmpgt_epi8(b, a); } SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { @@ -456,13 +578,25 @@ SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) { } SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) { - return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a)); + return _mm256_cmpgt_epi16(b, a); } SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return _mm256_cmpeq_epi16(a, b); } +SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(a, b); +} + +SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) { + return _mm256_cmpgt_epi32(b, a); +} + +SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { + return _mm256_cmpeq_epi32(a, b); +} + SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) { return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)), _mm256_sll_epi16(a, _mm_cvtsi32_si128(c))); @@ -503,27 +637,42 @@ SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c)); } +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c)); +} + +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { +#if defined(__AVX512F__) + return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c)); +#else + return v256_from_v128(v128_shr_s64(v256_high_v128(a), c), + v128_shr_s64(v256_low_v128(a), c)); +#endif +} + /* These intrinsics require immediate values, so we must use #defines to enforce that. */ // _mm256_slli_si256 works on 128 bit lanes and can't be used -#define v256_shl_n_byte(a, n) \ - ((n) < 16 \ - ? v256_from_v128(v128_or(v128_shl_n_byte(v256_high_v128(a), n), \ - v128_shr_n_byte(v256_low_v128(a), 16 - (n))), \ - v128_shl_n_byte(v256_low_v128(a), n)) \ - : v256_from_v128(v128_shl_n_byte(v256_low_v128(a), (n)-16), \ - v128_zero())) +#define v256_shl_n_byte(a, n) \ + ((n) < 16 ? v256_from_v128( \ + v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \ + v128_shl_n_byte(v256_low_v128(a), n)) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_shl_n_byte(v256_low_v128(a), (n)-16), 1)) // _mm256_srli_si256 works on 128 bit lanes and can't be used -#define v256_shr_n_byte(a, n) \ - ((n) < 16 \ - ? _mm256_alignr_epi8( \ - _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ - : ((n) > 16 \ - ? _mm256_srli_si256( \ - _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), \ - (n)-16) \ - : _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)))) +#define v256_shr_n_byte(a, n) \ + ((n) < 16 \ + ? _mm256_alignr_epi8( \ + _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \ + : _mm256_inserti128_si256( \ + _mm256_setzero_si256(), \ + v128_align(v256_high_v128(a), v256_high_v128(a), n), 0)) // _mm256_alignr_epi8 works on two 128 bit lanes and can't be used #define v256_align(a, b, c) \ @@ -543,6 +692,59 @@ SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { #define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c) #define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c) #define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c) +#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c) +#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c) +#define v256_shr_n_s64(a, c) \ + v256_shr_s64((a), (c)) // _mm256_srai_epi64 broken in gcc? +#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n)) +#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n)) + +typedef v256 sad256_internal_u16; + +SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_sad_u16_sum(). */ +SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a, + v256 b) { +#if defined(__SSE4_1__) + v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b)); +#else + v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)), + v256_xor(b, v256_dup_16(32768))); + t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)), + v256_or(v256_and(a, t), v256_andn(b, t))); +#endif + return v256_add_32( + s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t))); +} + +SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) { + v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s)); + return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) + + v128_low_u32(v128_shr_n_byte(t, 8)) + + v128_low_u32(v128_shr_n_byte(t, 12)); +} + +typedef v256 ssd256_internal_s16; + +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); } + +/* Implementation dependent return value. Result must be finalised with + * v256_ssd_s16_sum(). */ +SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a, + v256 b) { + v256 d = v256_sub_16(a, b); + d = v256_madd_s16(d, d); + return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()), + _mm256_unpacklo_epi32(d, v256_zero()))); +} + +SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { + v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s)); + return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t)); +} + #endif #endif /* _V256_INTRINSICS_H */ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h index 5c0042d8c..6ce53c6a9 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h @@ -14,7 +14,8 @@ #include #include -#include "./v64_intrinsics_c.h" + +#include "aom_dsp/simd/v64_intrinsics_c.h" /* Fallback to plain, unoptimised C. */ @@ -71,6 +72,8 @@ SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); } SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); } SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); } +SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); } +SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); } SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); } SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); } SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); } @@ -100,6 +103,9 @@ SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); } SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { return c_v64_pack_s32_s16(a, b); } +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { + return c_v64_pack_s32_u16(a, b); +} SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { return c_v64_pack_s16_u8(a, b); } @@ -156,6 +162,7 @@ SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); } SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); } SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); } +SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); } SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); } SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); } SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); } diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h index c7574eef5..267441b02 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h @@ -13,7 +13,8 @@ #define _V64_INTRINSICS_H #include -#include "./v64_intrinsics_arm.h" + +#include "aom_dsp/simd/v64_intrinsics_arm.h" #include "aom_ports/arm.h" #ifdef AOM_INCOMPATIBLE_GCC @@ -121,20 +122,34 @@ SIMD_INLINE v64 v64_dup_32(uint32_t x) { } SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { - int64x2_t r = vpaddlq_s32(vpaddlq_s16( + int16x8_t t = vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))))); + vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))); +#if defined(__aarch64__) + return vaddlvq_s16(t); +#else + int64x2_t r = vpaddlq_s32(vpaddlq_s16(t)); return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r)); +#endif } SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { +#if defined(__aarch64__) + return vaddlvq_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); +#else int64x2_t r = vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); return (int64_t)(vget_high_s64(r) + vget_low_s64(r)); +#endif } SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { +#if defined(__aarch64__) + return vaddlv_u8(vreinterpret_u8_s64(x)); +#else return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))); +#endif } SIMD_INLINE int64_t v64_hadd_s16(v64 a) { @@ -145,34 +160,40 @@ typedef uint16x8_t sad64_internal; SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); } -/* Implementation dependent return value. Result must be finalised with - v64_sad_u8_sum(). - The result for more than 32 v64_sad_u8() calls is undefined. */ +// Implementation dependent return value. Result must be finalised with +// v64_sad_u8_sum(). SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); } SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { +#if defined(__aarch64__) + return vaddlvq_u16(s); +#else uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r)); +#endif } -typedef int64x1_t ssd64_internal; +typedef uint32x4_t ssd64_internal; -SIMD_INLINE ssd64_internal v64_ssd_u8_init() { - return (ssd64_internal)(uint64_t)0; -} +SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); } -/* Implementation dependent return value. Result must be finalised with - * v64_ssd_u8_sum(). */ +// Implementation dependent return value. Result must be finalised with +// v64_ssd_u8_sum(). SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); - uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t))); - return vadd_u64(s, vadd_u64(vget_high_u64(r), vget_low_u64(r))); + return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t))); } SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { - return (uint32_t)(uint64_t)s; +#if defined(__aarch64__) + return vaddvq_u32(s); +#else + uint64x2_t t = vpaddlq_u32(s); + return vget_lane_u32( + vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); +#endif } SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } @@ -188,6 +209,16 @@ SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); } +SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) { + return vreinterpret_s64_u8( + vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); +} + +SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) { + return vreinterpret_s64_s8( + vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); +} + SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { return vreinterpret_s64_s16( vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); @@ -252,8 +283,14 @@ SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { } SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { +#if defined(__aarch64__) + int16x8_t t = vreinterpretq_s16_s32( + vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); + return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t))); +#else return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); +#endif } SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { @@ -269,10 +306,10 @@ SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { } SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { - return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16( - vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)), - vreinterpret_s8_s64(y)), - vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7))))); + int16x8_t t = + vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))), + vmovl_s8(vreinterpret_s8_s64(y))); + return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t))); } SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { @@ -285,6 +322,11 @@ SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); } +SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) { + return vreinterpret_s64_u16( + vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); +} + SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { return vreinterpret_s64_u16( vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); @@ -321,33 +363,63 @@ SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { } SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); return vreinterpret_s64_u8(r.val[0]); +#endif } SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); return vreinterpret_s64_u8(r.val[1]); +#endif } SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); return vreinterpret_s64_s16(r.val[0]); +#endif } SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); return vreinterpret_s64_s16(r.val[1]); +#endif } SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u32( + vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); +#else int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); return vreinterpret_s64_s32(r.val[0]); +#endif } SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u32( + vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); +#else int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); return vreinterpret_s64_s32(r.val[1]); +#endif } SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { @@ -371,6 +443,11 @@ SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); } +SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) { + return vreinterpret_s64_u16(vqmovun_s32( + vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); +} + SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); @@ -382,23 +459,43 @@ SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { } SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); return vreinterpret_s64_u8(r.val[0]); +#endif } SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u8( + vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); +#else uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); return vreinterpret_s64_u8(r.val[1]); +#endif } SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); return vreinterpret_s64_u16(r.val[0]); +#endif } SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { +#if defined(__aarch64__) + return vreinterpret_s64_u16( + vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); +#else uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); return vreinterpret_s64_u16(r.val[1]); +#endif } SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h index 5032238b6..8158899cb 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h @@ -17,7 +17,8 @@ #include #include -#include "./aom_config.h" + +#include "config/aom_config.h" typedef union { uint8_t u8[8]; @@ -30,13 +31,17 @@ typedef union { int64_t s64; } c_v64; -SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; } +SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { + return a.u32[!!CONFIG_BIG_ENDIAN]; +} SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) { return a.u32[!CONFIG_BIG_ENDIAN]; } -SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; } +SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { + return a.s32[!!CONFIG_BIG_ENDIAN]; +} SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { return a.s32[!CONFIG_BIG_ENDIAN]; @@ -45,7 +50,7 @@ SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) { SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) { c_v64 t; t.u32[!CONFIG_BIG_ENDIAN] = x; - t.u32[CONFIG_BIG_ENDIAN] = y; + t.u32[!!CONFIG_BIG_ENDIAN] = y; return t; } @@ -177,6 +182,30 @@ SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) { return t; } +SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255 + ? 255 + : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0 + ? 0 + : (int16_t)a.u8[c] + (int16_t)b.u8[c]; + return t; +} + +SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 8; c++) + t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127 + ? 127 + : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128 + ? -128 + : (int16_t)a.s8[c] + (int16_t)b.s8[c]; + return t; +} + SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) { c_v64 t; int c; @@ -206,8 +235,7 @@ SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) { SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) { c_v64 t; int c; - for (c = 0; c < 8; c++) - t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c]; + for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c]; return t; } @@ -459,6 +487,20 @@ SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) { return t; } +SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) { + c_v64 t; + if (CONFIG_BIG_ENDIAN) { + c_v64 u = a; + a = b; + b = u; + } + t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1]; + t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0]; + t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1]; + t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0]; + return t; +} + SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) { c_v64 t; if (CONFIG_BIG_ENDIAN) { @@ -670,6 +712,13 @@ SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) { return t; } +SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) { + c_v64 t; + int c; + for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1; + return t; +} + SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) { c_v64 t; int c; diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h index 8dcc9f6fc..130052ee1 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h @@ -90,8 +90,7 @@ SIMD_INLINE void v64_store_unaligned(void *p, v64 a) { _mm_storel_epi64((__m128i *)p, a); } -// The following function requires an immediate. -#if defined(__OPTIMIZE__) && __OPTIMIZE__ +#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) #define v64_align(a, b, c) \ ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b) #else @@ -112,6 +111,10 @@ SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); } SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); } +SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); } + +SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); } + SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); } SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); } @@ -170,6 +173,22 @@ SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) { return _mm_packs_epi32(t, t); } +SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) { +#if defined(__SSE4_1__) + __m128i t = _mm_unpacklo_epi64(b, a); + return _mm_packus_epi32(t, t); +#else + int32_t ah = v64_high_u32(a); + int32_t al = v64_low_u32(a); + int32_t bh = v64_high_u32(b); + int32_t bl = v64_low_u32(b); + return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah, + al > 65535 ? 65535 : al < 0 ? 0 : al, + bh > 65535 ? 65535 : bh < 0 ? 0 : bh, + bl > 65535 ? 65535 : bl < 0 ? 0 : bl); +#endif +} + SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) { __m128i t = _mm_unpacklo_epi64(b, a); return _mm_packus_epi16(t, t); @@ -272,14 +291,11 @@ SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { } SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { - __m128i r, r1, r2, z; - z = _mm_setzero_si128(); - r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8), - _mm_unpacklo_epi8(b, z)); - r2 = _mm_srli_si128(r1, 8); - r = _mm_add_epi32(r1, r2); - r = _mm_add_epi32(r, _mm_srli_si128(r, 4)); - return ((int32_t)v64_low_u32(r)) >> 8; + __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), + _mm_unpacklo_epi8(b, _mm_setzero_si128())); + t = _mm_add_epi32(t, _mm_srli_si128(t, 8)); + t = _mm_add_epi32(t, _mm_srli_si128(t, 4)); + return (int32_t)v64_low_u32(t); } SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { @@ -371,6 +387,11 @@ SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1))); } +SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { + return _mm_sub_epi16(_mm_avg_epu16(a, b), + _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1))); +} + SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); } SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); } diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c index 6ae378ff2..6ce3d7acb 100644 --- a/third_party/aom/aom_dsp/ssim.c +++ b/third_party/aom/aom_dsp/ssim.c @@ -11,7 +11,9 @@ #include #include -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/ssim.h" #include "aom_ports/mem.h" #include "aom_ports/system_state.h" @@ -31,6 +33,7 @@ void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } } } + void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr) { @@ -46,7 +49,6 @@ void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp, } } -#if CONFIG_HIGHBITDEPTH void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, @@ -62,7 +64,6 @@ void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r, } } } -#endif // CONFIG_HIGHBITDEPTH static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 @@ -108,7 +109,6 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); } -#if CONFIG_HIGHBITDEPTH static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t bd, uint32_t shift) { uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; @@ -117,7 +117,6 @@ static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); } -#endif // CONFIG_HIGHBITDEPTH // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap @@ -142,7 +141,6 @@ static double aom_ssim2(const uint8_t *img1, const uint8_t *img2, return ssim_total; } -#if CONFIG_HIGHBITDEPTH static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, int stride_img2, int width, int height, uint32_t bd, uint32_t shift) { @@ -164,7 +162,6 @@ static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2, ssim_total /= samples; return ssim_total; } -#endif // CONFIG_HIGHBITDEPTH double aom_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight) { @@ -422,7 +419,6 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, return inconsistency_total; } -#if CONFIG_HIGHBITDEPTH double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd) { @@ -441,4 +437,3 @@ double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, *weight = 1; return abc[0] * .8 + .1 * (abc[1] + abc[2]); } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h index 902735e50..c8a389dfe 100644 --- a/third_party/aom/aom_dsp/ssim.h +++ b/third_party/aom/aom_dsp/ssim.h @@ -18,7 +18,8 @@ extern "C" { #endif -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_scale/yv12config.h" // metrics used for calculating ssim, ssim2, dssim, and ssimc @@ -75,11 +76,9 @@ double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source, double *ssim_u, double *ssim_v, uint32_t bd, uint32_t in_bd); -#if CONFIG_HIGHBITDEPTH double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, const YV12_BUFFER_CONFIG *dest, double *weight, uint32_t bd, uint32_t in_bd); -#endif // CONFIG_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c index 8dda96efb..2f6da96e5 100644 --- a/third_party/aom/aom_dsp/subtract.c +++ b/third_party/aom/aom_dsp/subtract.c @@ -11,8 +11,8 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" @@ -32,7 +32,6 @@ void aom_subtract_block_c(int rows, int cols, int16_t *diff, } } -#if CONFIG_HIGHBITDEPTH void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, @@ -52,4 +51,3 @@ void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, src += src_stride; } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c index b9155fdc0..44ec41f2e 100644 --- a/third_party/aom/aom_dsp/sum_squares.c +++ b/third_party/aom/aom_dsp/sum_squares.c @@ -11,7 +11,7 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width, int height) { diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h index ef9e9bc98..7deb0aea3 100644 --- a/third_party/aom/aom_dsp/txfm_common.h +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -28,25 +28,11 @@ typedef struct txfm_param { TX_SIZE tx_size; int lossless; int bd; -#if CONFIG_MRC_TX || CONFIG_LGT - int is_inter; -#endif // CONFIG_MRC_TX || CONFIG_LGT -#if CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED - int stride; - uint8_t *dst; -#if CONFIG_MRC_TX - int *valid_mask; - uint8_t *mask; -#endif // CONFIG_MRC_TX -#if CONFIG_LGT_FROM_PRED - int mode; - int use_lgt; -#endif // CONFIG_LGT_FROM_PRED -#endif // CONFIG_MRC_TX || CONFIG_LGT_FROM_PRED -// for inverse transforms only -#if CONFIG_ADAPT_SCAN - const int16_t *eob_threshold; -#endif + // are the pixel buffers octets or shorts? This should collapse to + // bd==8 implies !is_hbd, but that's not certain right now. + int is_hbd; + TxSetType tx_set_type; + // for inverse transforms only int eob; } TxfmParam; @@ -102,647 +88,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { return rv; } -#if CONFIG_LGT_FROM_PRED -// Use negative numbers so they do not coincide with lgt*[0][0], which are -// always nonnegative. -typedef enum { - DCT4 = -1, - ADST4 = -2, - DCT8 = -3, - ADST8 = -4, - DCT16 = -5, - ADST16 = -6, - DCT32 = -7, - ADST32 = -8, -} ButterflyLgt; - -/* These are some LGTs already implementated in the codec. When any of them - * is chosen, the flgt or ilgt function will call the existing fast - * transform instead of the matrix product implementation. Thus, we - * do not need the actual basis functions here */ -static const tran_high_t lgt4_000[1][1] = { { (tran_high_t)DCT4 } }; -static const tran_high_t lgt4_100[1][1] = { { (tran_high_t)ADST4 } }; -static const tran_high_t lgt8_000[1][1] = { { (tran_high_t)DCT8 } }; -static const tran_high_t lgt8_200[1][1] = { { (tran_high_t)ADST8 } }; -static const tran_high_t lgt16_000[1][1] = { { (tran_high_t)DCT16 } }; -static const tran_high_t lgt16_200[1][1] = { { (tran_high_t)ADST16 } }; -static const tran_high_t lgt32_000[1][1] = { { (tran_high_t)DCT32 } }; -static const tran_high_t lgt32_200[1][1] = { { (tran_high_t)ADST32 } }; - -/* The Line Graph Transforms (LGTs) matrices are written as follows. - Each 2D array is sqrt(2)*16384 times an LGT matrix, which is the - matrix of eigenvectors of the graph Laplacian matrix of the associated - line graph. Some of those transforms have fast algorithms but not - implemented yet for now. */ - -// LGT4 name: lgt4_150_000w3 -// Self loops: 1.500, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000 -static const tran_high_t lgt4_150_000w3[4][4] = { - { 0, 0, 0, 23170 }, - { 5991, 13537, 17825, 0 }, - { 15515, 10788, -13408, 0 }, - { 16133, -15403, 6275, 0 }, -}; - -// LGT4 name: lgt4_100_000w3 -// Self loops: 1.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000 -static const tran_high_t lgt4_100_000w3[4][4] = { - { 0, 0, 0, 23170 }, - { 7600, 13694, 17076, 0 }, - { 17076, 7600, -13694, 0 }, - { 13694, -17076, 7600, 0 }, -}; - -// LGT4 name: lgt4_060_000w3 -// Self loops: 0.600, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000 -static const tran_high_t lgt4_060_000w3[4][4] = { - { 0, 0, 0, 23170 }, - { 9449, 13755, 16075, 0 }, - { 17547, 4740, -14370, 0 }, - { 11819, -18034, 8483, 0 }, -}; - -// LGT4 name: lgt4_000w3 -// Self loops: 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000 -static const tran_high_t lgt4_000w3[4][4] = { - { 0, 0, 0, 23170 }, - { 13377, 13377, 13377, 0 }, - { 16384, 0, -16384, 0 }, - { 9459, -18919, 9459, 0 }, -}; - -// LGT4 name: lgt4_150_000w2 -// Self loops: 1.500, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000 -static const tran_high_t lgt4_150_000w2[4][4] = { - { 10362, 20724, 0, 0 }, - { 20724, -10362, 0, 0 }, - { 0, 0, 16384, 16384 }, - { 0, 0, 16384, -16384 }, -}; - -// LGT4 name: lgt4_100_000w2 -// Self loops: 1.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000 -static const tran_high_t lgt4_100_000w2[4][4] = { - { 12181, 19710, 0, 0 }, - { 19710, -12181, 0, 0 }, - { 0, 0, 16384, 16384 }, - { 0, 0, 16384, -16384 }, -}; - -// LGT4 name: lgt4_060_000w2 -// Self loops: 0.600, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000 -static const tran_high_t lgt4_060_000w2[4][4] = { - { 13831, 18590, 0, 0 }, - { 18590, -13831, 0, 0 }, - { 0, 0, 16384, 16384 }, - { 0, 0, 16384, -16384 }, -}; - -// LGT4 name: lgt4_000w2 -// Self loops: 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000 -static const tran_high_t lgt4_000w2[4][4] = { - { 16384, 16384, 0, 0 }, - { 16384, -16384, 0, 0 }, - { 0, 0, 16384, 16384 }, - { 0, 0, 16384, -16384 }, -}; - -// LGT4 name: lgt4_150_000w1 -// Self loops: 1.500, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000 -static const tran_high_t lgt4_150_000w1[4][4] = { - { 23170, 0, 0, 0 }, - { 0, 13377, 13377, 13377 }, - { 0, 16384, 0, -16384 }, - { 0, 9459, -18919, 9459 }, -}; - -// LGT4 name: lgt4_100_000w1 -// Self loops: 1.000, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000 -static const tran_high_t lgt4_100_000w1[4][4] = { - { 23170, 0, 0, 0 }, - { 0, 13377, 13377, 13377 }, - { 0, 16384, 0, -16384 }, - { 0, 9459, -18919, 9459 }, -}; - -// LGT4 name: lgt4_060_000w1 -// Self loops: 0.600, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000 -static const tran_high_t lgt4_060_000w1[4][4] = { - { 23170, 0, 0, 0 }, - { 0, 13377, 13377, 13377 }, - { 0, 16384, 0, -16384 }, - { 0, 9459, -18919, 9459 }, -}; - -// LGT4 name: lgt4_000w1 -// Self loops: 0.000, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000 -static const tran_high_t lgt4_000w1[4][4] = { - { 23170, 0, 0, 0 }, - { 0, 13377, 13377, 13377 }, - { 0, 16384, 0, -16384 }, - { 0, 9459, -18919, 9459 }, -}; - -// LGT4 name: lgt4_060 -// Self loops: 0.600, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000 -static const tran_high_t lgt4_060[4][4] = { - { 6971, 10504, 13060, 14400 }, - { 14939, 11211, -2040, -13559 }, - { 14096, -8258, -12561, 10593 }, - { 8150, -15253, 14295, -5784 }, -}; - -// LGT4 name: lgt4_150 -// Self loops: 1.500, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000 -static const tran_high_t lgt4_150[4][4] = { - { 3998, 9435, 13547, 15759 }, - { 11106, 15105, 1886, -13483 }, - { 15260, -1032, -14674, 9361 }, - { 12833, -14786, 11596, -4372 }, -}; - -// LGT8 name: lgt8_150_000w7 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 -static const tran_high_t lgt8_150_000w7[8][8] = { - { 0, 0, 0, 0, 0, 0, 0, 32768 }, - { 2522, 6185, 9551, 12461, 14775, 16381, 17204, 0 }, - { 7390, 15399, 16995, 11515, 1240, -9551, -16365, 0 }, - { 11716, 16625, 3560, -13353, -15831, -1194, 14733, 0 }, - { 15073, 8866, -14291, -10126, 13398, 11308, -12401, 0 }, - { 16848, -4177, -13724, 14441, 2923, -16628, 9513, 0 }, - { 15942, -14888, 5405, 7137, -15640, 15288, -6281, 0 }, - { 10501, -14293, 16099, -15670, 13063, -8642, 3021, 0 }, -}; - -// LGT8 name: lgt8_100_000w7 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 -static const tran_high_t lgt8_100_000w7[8][8] = { - { 0, 0, 0, 0, 0, 0, 0, 32768 }, - { 3518, 6883, 9946, 12575, 14654, 16093, 16829, 0 }, - { 9946, 16093, 16093, 9946, 0, -9946, -16093, 0 }, - { 14654, 14654, 0, -14654, -14654, 0, 14654, 0 }, - { 16829, 3518, -16093, -6883, 14654, 9946, -12575, 0 }, - { 16093, -9946, -9946, 16093, 0, -16093, 9946, 0 }, - { 12575, -16829, 9946, 3518, -14654, 16093, -6883, 0 }, - { 6883, -12575, 16093, -16829, 14654, -9946, 3518, 0 }, -}; - -// LGT8 name: lgt8_060_000w7 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 -static const tran_high_t lgt8_060_000w7[8][8] = { - { 0, 0, 0, 0, 0, 0, 0, 32768 }, - { 5087, 7951, 10521, 12701, 14411, 15587, 16186, 0 }, - { 13015, 16486, 14464, 7621, -1762, -10557, -15834, 0 }, - { 16581, 11475, -4050, -15898, -13311, 1362, 14798, 0 }, - { 16536, -1414, -16981, -3927, 15746, 8879, -12953, 0 }, - { 14104, -13151, -7102, 16932, -1912, -15914, 10385, 0 }, - { 10156, -17168, 11996, 1688, -14174, 16602, -7249, 0 }, - { 5295, -11721, 15961, -17224, 15274, -10476, 3723, 0 }, -}; - -// LGT8 name: lgt8_000w7 -// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 0.000 -static const tran_high_t lgt8_000w7[8][8] = { - { 0, 0, 0, 0, 0, 0, 0, 32768 }, - { 12385, 12385, 12385, 12385, 12385, 12385, 12385, 0 }, - { 17076, 13694, 7600, 0, -7600, -13694, -17076, 0 }, - { 15781, 3898, -10921, -17515, -10921, 3898, 15781, 0 }, - { 13694, -7600, -17076, 0, 17076, 7600, -13694, 0 }, - { 10921, -15781, -3898, 17515, -3898, -15781, 10921, 0 }, - { 7600, -17076, 13694, 0, -13694, 17076, -7600, 0 }, - { 3898, -10921, 15781, -17515, 15781, -10921, 3898, 0 }, -}; - -// LGT8 name: lgt8_150_000w6 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 -static const tran_high_t lgt8_150_000w6[8][8] = { - { 0, 0, 0, 0, 0, 0, 23170, 23170 }, - { 0, 0, 0, 0, 0, 0, 23170, -23170 }, - { 3157, 7688, 11723, 15002, 17312, 18506, 0, 0 }, - { 9167, 17832, 16604, 6164, -7696, -17286, 0, 0 }, - { 14236, 15584, -4969, -18539, -6055, 14938, 0, 0 }, - { 17558, 1891, -18300, 5288, 16225, -11653, 0, 0 }, - { 17776, -13562, -647, 14380, -17514, 7739, 0, 0 }, - { 12362, -16318, 17339, -15240, 10399, -3688, 0, 0 }, -}; - -// LGT8 name: lgt8_100_000w6 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 -static const tran_high_t lgt8_100_000w6[8][8] = { - { 0, 0, 0, 0, 0, 0, 23170, 23170 }, - { 0, 0, 0, 0, 0, 0, 23170, -23170 }, - { 4350, 8447, 12053, 14959, 16995, 18044, 0, 0 }, - { 12053, 18044, 14959, 4350, -8447, -16995, 0, 0 }, - { 16995, 12053, -8447, -18044, -4350, 14959, 0, 0 }, - { 18044, -4350, -16995, 8447, 14959, -12053, 0, 0 }, - { 14959, -16995, 4350, 12053, -18044, 8447, 0, 0 }, - { 8447, -14959, 18044, -16995, 12053, -4350, 0, 0 }, -}; - -// LGT8 name: lgt8_060_000w6 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 -static const tran_high_t lgt8_060_000w6[8][8] = { - { 0, 0, 0, 0, 0, 0, 23170, 23170 }, - { 0, 0, 0, 0, 0, 0, 23170, -23170 }, - { 6154, 9551, 12487, 14823, 16446, 17277, 0, 0 }, - { 15149, 17660, 12503, 1917, -9502, -16795, 0, 0 }, - { 18166, 7740, -11772, -17465, -2656, 15271, 0, 0 }, - { 16682, -8797, -15561, 10779, 14189, -12586, 0, 0 }, - { 12436, -18234, 7007, 10763, -18483, 8945, 0, 0 }, - { 6591, -14172, 18211, -17700, 12766, -4642, 0, 0 }, -}; - -// LGT8 name: lgt8_000w6 -// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 0.000, 1.000 -static const tran_high_t lgt8_000w6[8][8] = { - { 0, 0, 0, 0, 0, 0, 23170, 23170 }, - { 0, 0, 0, 0, 0, 0, 23170, -23170 }, - { 13377, 13377, 13377, 13377, 13377, 13377, 0, 0 }, - { 18274, 13377, 4896, -4896, -13377, -18274, 0, 0 }, - { 16384, 0, -16384, -16384, 0, 16384, 0, 0 }, - { 13377, -13377, -13377, 13377, 13377, -13377, 0, 0 }, - { 9459, -18919, 9459, 9459, -18919, 9459, 0, 0 }, - { 4896, -13377, 18274, -18274, 13377, -4896, 0, 0 }, -}; - -// LGT8 name: lgt8_150_000w5 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 -static const tran_high_t lgt8_150_000w5[8][8] = { - { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, - { 0, 0, 0, 0, 0, 23170, 0, -23170 }, - { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, - { 4109, 9895, 14774, 18299, 20146, 0, 0, 0 }, - { 11753, 20300, 13161, -4148, -18252, 0, 0, 0 }, - { 17573, 10921, -16246, -12895, 14679, 0, 0, 0 }, - { 19760, -9880, -9880, 19760, -9880, 0, 0, 0 }, - { 14815, -18624, 17909, -12844, 4658, 0, 0, 0 }, -}; - -// LGT8 name: lgt8_100_000w5 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 -static const tran_high_t lgt8_100_000w5[8][8] = { - { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, - { 0, 0, 0, 0, 0, 23170, 0, -23170 }, - { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, - { 5567, 10683, 14933, 17974, 19559, 0, 0, 0 }, - { 14933, 19559, 10683, -5567, -17974, 0, 0, 0 }, - { 19559, 5567, -17974, -10683, 14933, 0, 0, 0 }, - { 17974, -14933, -5567, 19559, -10683, 0, 0, 0 }, - { 10683, -17974, 19559, -14933, 5567, 0, 0, 0 }, -}; - -// LGT8 name: lgt8_060_000w5 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 -static const tran_high_t lgt8_060_000w5[8][8] = { - { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, - { 0, 0, 0, 0, 0, 23170, 0, -23170 }, - { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, - { 7650, 11741, 15069, 17415, 18628, 0, 0, 0 }, - { 17824, 18002, 7558, -7345, -17914, 0, 0, 0 }, - { 19547, 569, -19303, -8852, 15505, 0, 0, 0 }, - { 15592, -17548, -2862, 19625, -11374, 0, 0, 0 }, - { 8505, -17423, 20218, -15907, 6006, 0, 0, 0 }, -}; - -// LGT8 name: lgt8_000w5 -// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 0.000, 1.000, 1.000 -static const tran_high_t lgt8_000w5[8][8] = { - { 0, 0, 0, 0, 0, 18919, 18919, 18919 }, - { 0, 0, 0, 0, 0, 23170, 0, -23170 }, - { 0, 0, 0, 0, 0, 13377, -26755, 13377 }, - { 14654, 14654, 14654, 14654, 14654, 0, 0, 0 }, - { 19710, 12181, 0, -12181, -19710, 0, 0, 0 }, - { 16766, -6404, -20724, -6404, 16766, 0, 0, 0 }, - { 12181, -19710, 0, 19710, -12181, 0, 0, 0 }, - { 6404, -16766, 20724, -16766, 6404, 0, 0, 0 }, -}; - -// LGT8 name: lgt8_150_000w4 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_150_000w4[8][8] = { - { 5655, 13343, 19159, 22286, 0, 0, 0, 0 }, - { 15706, 21362, 2667, -19068, 0, 0, 0, 0 }, - { 21580, -1459, -20752, 13238, 0, 0, 0, 0 }, - { 18148, -20910, 16399, -6183, 0, 0, 0, 0 }, - { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, - { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, - { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, - { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, -}; - -// LGT8 name: lgt8_100_000w4 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_100_000w4[8][8] = { - { 7472, 14042, 18919, 21513, 0, 0, 0, 0 }, - { 18919, 18919, 0, -18919, 0, 0, 0, 0 }, - { 21513, -7472, -18919, 14042, 0, 0, 0, 0 }, - { 14042, -21513, 18919, -7472, 0, 0, 0, 0 }, - { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, - { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, - { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, - { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, -}; - -// LGT8 name: lgt8_060_000w4 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_060_000w4[8][8] = { - { 9858, 14855, 18470, 20365, 0, 0, 0, 0 }, - { 21127, 15855, -2886, -19175, 0, 0, 0, 0 }, - { 19935, -11679, -17764, 14980, 0, 0, 0, 0 }, - { 11525, -21570, 20217, -8180, 0, 0, 0, 0 }, - { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, - { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, - { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, - { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, -}; - -// LGT8 name: lgt8_000w4 -// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 0.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_000w4[8][8] = { - { 16384, 16384, 16384, 16384, 0, 0, 0, 0 }, - { 21407, 8867, -8867, -21407, 0, 0, 0, 0 }, - { 16384, -16384, -16384, 16384, 0, 0, 0, 0 }, - { 8867, -21407, 21407, -8867, 0, 0, 0, 0 }, - { 0, 0, 0, 0, 16384, 16384, 16384, 16384 }, - { 0, 0, 0, 0, 21407, 8867, -8867, -21407 }, - { 0, 0, 0, 0, 16384, -16384, -16384, 16384 }, - { 0, 0, 0, 0, 8867, -21407, 21407, -8867 }, -}; - -// LGT8 name: lgt8_150_000w3 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_150_000w3[8][8] = { - { 8473, 19144, 25209, 0, 0, 0, 0, 0 }, - { 21942, 15257, -18961, 0, 0, 0, 0, 0 }, - { 22815, -21783, 8874, 0, 0, 0, 0, 0 }, - { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, - { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, - { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, - { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, - { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, -}; - -// LGT8 name: lgt8_100_000w3 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_100_000w3[8][8] = { - { 10747, 19366, 24149, 0, 0, 0, 0, 0 }, - { 24149, 10747, -19366, 0, 0, 0, 0, 0 }, - { 19366, -24149, 10747, 0, 0, 0, 0, 0 }, - { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, - { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, - { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, - { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, - { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, -}; - -// LGT8 name: lgt8_060_000w3 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_060_000w3[8][8] = { - { 13363, 19452, 22733, 0, 0, 0, 0, 0 }, - { 24815, 6704, -20323, 0, 0, 0, 0, 0 }, - { 16715, -25503, 11997, 0, 0, 0, 0, 0 }, - { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, - { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, - { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, - { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, - { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, -}; - -// LGT8 name: lgt8_000w3 -// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 0.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_000w3[8][8] = { - { 18919, 18919, 18919, 0, 0, 0, 0, 0 }, - { 23170, 0, -23170, 0, 0, 0, 0, 0 }, - { 13377, -26755, 13377, 0, 0, 0, 0, 0 }, - { 0, 0, 0, 14654, 14654, 14654, 14654, 14654 }, - { 0, 0, 0, 19710, 12181, 0, -12181, -19710 }, - { 0, 0, 0, 16766, -6404, -20724, -6404, 16766 }, - { 0, 0, 0, 12181, -19710, 0, 19710, -12181 }, - { 0, 0, 0, 6404, -16766, 20724, -16766, 6404 }, -}; - -// LGT8 name: lgt8_150_000w2 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_150_000w2[8][8] = { - { 14654, 29309, 0, 0, 0, 0, 0, 0 }, - { 29309, -14654, 0, 0, 0, 0, 0, 0 }, - { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, - { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, - { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, - { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, - { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, - { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, -}; - -// LGT8 name: lgt8_100_000w2 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_100_000w2[8][8] = { - { 17227, 27874, 0, 0, 0, 0, 0, 0 }, - { 27874, -17227, 0, 0, 0, 0, 0, 0 }, - { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, - { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, - { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, - { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, - { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, - { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, -}; - -// LGT8 name: lgt8_060_000w2 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_060_000w2[8][8] = { - { 19560, 26290, 0, 0, 0, 0, 0, 0 }, - { 26290, -19560, 0, 0, 0, 0, 0, 0 }, - { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, - { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, - { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, - { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, - { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, - { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, -}; - -// LGT8 name: lgt8_000w2 -// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 0.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_000w2[8][8] = { - { 23170, 23170, 0, 0, 0, 0, 0, 0 }, - { 23170, -23170, 0, 0, 0, 0, 0, 0 }, - { 0, 0, 13377, 13377, 13377, 13377, 13377, 13377 }, - { 0, 0, 18274, 13377, 4896, -4896, -13377, -18274 }, - { 0, 0, 16384, 0, -16384, -16384, 0, 16384 }, - { 0, 0, 13377, -13377, -13377, 13377, 13377, -13377 }, - { 0, 0, 9459, -18919, 9459, 9459, -18919, 9459 }, - { 0, 0, 4896, -13377, 18274, -18274, 13377, -4896 }, -}; - -// LGT8 name: lgt8_150_000w1 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_150_000w1[8][8] = { - { 32768, 0, 0, 0, 0, 0, 0, 0 }, - { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, - { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, - { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, - { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, - { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, - { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, - { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, -}; - -// LGT8 name: lgt8_100_000w1 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_100_000w1[8][8] = { - { 32768, 0, 0, 0, 0, 0, 0, 0 }, - { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, - { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, - { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, - { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, - { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, - { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, - { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, -}; - -// LGT8 name: lgt8_060_000w1 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_060_000w1[8][8] = { - { 32768, 0, 0, 0, 0, 0, 0, 0 }, - { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, - { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, - { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, - { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, - { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, - { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, - { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, -}; - -// LGT8 name: lgt8_000w1 -// Self loops: 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 0.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_000w1[8][8] = { - { 32768, 0, 0, 0, 0, 0, 0, 0 }, - { 0, 12385, 12385, 12385, 12385, 12385, 12385, 12385 }, - { 0, 17076, 13694, 7600, 0, -7600, -13694, -17076 }, - { 0, 15781, 3898, -10921, -17515, -10921, 3898, 15781 }, - { 0, 13694, -7600, -17076, 0, 17076, 7600, -13694 }, - { 0, 10921, -15781, -3898, 17515, -3898, -15781, 10921 }, - { 0, 7600, -17076, 13694, 0, -13694, 17076, -7600 }, - { 0, 3898, -10921, 15781, -17515, 15781, -10921, 3898 }, -}; - -// LGT8 name: lgt8_060 -// Self loops: 0.600, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_060[8][8] = { - { 4295, 6746, 8999, 10987, 12653, 13947, 14832, 15280 }, - { 11303, 15101, 14912, 10786, 3812, -4168, -11047, -15010 }, - { 15051, 13208, 1823, -10879, -15721, -9207, 3959, 14265 }, - { 15871, 3800, -13441, -12395, 5516, 15922, 4665, -12939 }, - { 14630, -7269, -13926, 8618, 13091, -9886, -12133, 11062 }, - { 12008, -14735, 180, 14586, -12245, -4458, 15932, -8720 }, - { 8472, -15623, 14088, -4721, -7272, 15221, -14708, 6018 }, - { 4372, -9862, 13927, -15981, 15727, -13202, 8770, -3071 }, -}; - -// LGT8 name: lgt8_100 -// Self loops: 1.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_100[8][8] = { - { 2921, 5742, 8368, 10708, 12684, 14228, 15288, 15827 }, - { 8368, 14228, 15827, 12684, 5742, -2921, -10708, -15288 }, - { 12684, 15288, 5742, -8368, -15827, -10708, 2921, 14228 }, - { 15288, 8368, -10708, -14228, 2921, 15827, 5742, -12684 }, - { 15827, -2921, -15288, 5742, 14228, -8368, -12684, 10708 }, - { 14228, -12684, -2921, 15288, -10708, -5742, 15827, -8368 }, - { 10708, -15827, 12684, -2921, -8368, 15288, -14228, 5742 }, - { 5742, -10708, 14228, -15827, 15288, -12684, 8368, -2921 }, -}; -#endif // CONFIG_LGT_FROM_PRED - -#if CONFIG_LGT || CONFIG_LGT_FROM_PRED -// LGT4 name: lgt4_170 -// Self loops: 1.700, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000 -static const tran_high_t lgt4_170[4][4] = { - { 3636, 9287, 13584, 15902 }, - { 10255, 15563, 2470, -13543 }, - { 14786, 711, -15249, 9231 }, - { 14138, -14420, 10663, -3920 }, -}; - -// LGT4 name: lgt4_140 -// Self loops: 1.400, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000 -static const tran_high_t lgt4_140[4][4] = { - { 4206, 9518, 13524, 15674 }, - { 11552, 14833, 1560, -13453 }, - { 15391, -1906, -14393, 9445 }, - { 12201, -14921, 12016, -4581 }, -}; - -// LGT8 name: lgt8_170 -// Self loops: 1.700, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_170[8][8] = { - { 1858, 4947, 7850, 10458, 12672, 14411, 15607, 16217 }, - { 5494, 13022, 16256, 14129, 7343, -1864, -10456, -15601 }, - { 8887, 16266, 9500, -5529, -15749, -12273, 1876, 14394 }, - { 11870, 13351, -6199, -15984, -590, 15733, 7273, -12644 }, - { 14248, 5137, -15991, 291, 15893, -5685, -13963, 10425 }, - { 15716, -5450, -10010, 15929, -6665, -8952, 16036, -7835 }, - { 15533, -13869, 6559, 3421, -12009, 15707, -13011, 5018 }, - { 11357, -13726, 14841, -14600, 13025, -10259, 6556, -2254 }, -}; - -// LGT8 name: lgt8_150 -// Self loops: 1.500, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000 -// Edges: 1.000, 1.000, 1.000, 1.000, 1.000, 1.000, 1.000 -static const tran_high_t lgt8_150[8][8] = { - { 2075, 5110, 7958, 10511, 12677, 14376, 15544, 16140 }, - { 6114, 13307, 16196, 13845, 7015, -2084, -10509, -15534 }, - { 9816, 16163, 8717, -6168, -15790, -11936, 2104, 14348 }, - { 12928, 12326, -7340, -15653, 242, 15763, 6905, -12632 }, - { 15124, 3038, -16033, 1758, 15507, -6397, -13593, 10463 }, - { 15895, -7947, -7947, 15895, -7947, -7947, 15895, -7947 }, - { 14325, -15057, 9030, 1050, -10659, 15483, -13358, 5236 }, - { 9054, -12580, 14714, -15220, 14043, -11312, 7330, -2537 }, -}; -#endif // CONFIG_LGT || CONFIG_LGT_FROM_PRED #endif // AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index 3c99aa155..d367905bc 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -8,22 +8,24 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include #include #include -#include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" -#include "aom_ports/mem.h" #include "aom/aom_integer.h" +#include "aom_ports/mem.h" -#include "aom_dsp/variance.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/blend.h" +#include "aom_dsp/variance.h" -#include "./av1_rtcd.h" #include "av1/common/filter.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride) { @@ -106,12 +108,12 @@ uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -136,12 +138,12 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. -static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -165,38 +167,55 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t aom_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t aom_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ } -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ - \ - return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + \ + return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ + } \ + uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ } /* Identical to the variance call except it takes an additional parameter, sum, @@ -229,11 +248,9 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, SUBPIX_VAR(W, H) \ SUBPIX_AVG_VAR(W, H) -#if CONFIG_AV1 && CONFIG_EXT_PARTITION VARIANCES(128, 128) VARIANCES(128, 64) VARIANCES(64, 128) -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION VARIANCES(64, 64) VARIANCES(64, 32) VARIANCES(32, 64) @@ -250,19 +267,12 @@ VARIANCES(4, 4) VARIANCES(4, 2) VARIANCES(2, 4) VARIANCES(2, 2) - -#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES VARIANCES(4, 16) VARIANCES(16, 4) VARIANCES(8, 32) VARIANCES(32, 8) VARIANCES(16, 64) VARIANCES(64, 16) -#if CONFIG_EXT_PARTITION -VARIANCES(32, 128) -VARIANCES(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES GET_VAR(16, 16) GET_VAR(8, 8) @@ -288,61 +298,142 @@ void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } // Get pred block from up-sampled reference. -void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height, +void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + // Note: This is mostly a copy from the >=8X8 case in + // build_inter_predictors() function, with some small tweaks. + + // Some assumptions. + const int plane = 0; + + // Get pre-requisites. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = pd->subsampling_x; + const int ssy = pd->subsampling_y; + assert(ssx == 0 && ssy == 0); + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + + // Calculate subpel_x/y and x/y_step. + const int row_start = 0; // Because ss_y is 0. + const int col_start = 0; // Because ss_x is 0. + const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; + int orig_pos_y = pre_y << SUBPEL_BITS; + orig_pos_y += mv->row * (1 << (1 - ssy)); + int orig_pos_x = pre_x << SUBPEL_BITS; + orig_pos_x += mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + const uint8_t *const pre = + pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + pos_x & SCALE_SUBPEL_MASK, + pos_y & SCALE_SUBPEL_MASK }; + + // Get warp types. + const WarpedMotionParams *const wm = + &xd->global_motion[mi->ref_frame[ref_num]]; + const int is_global = is_global_mv_block(mi, wm->wmtype); + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + // Get convolve parameters. + ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + const InterpFilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // Get the inter predictor. + const int build_for_obmc = 0; + av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width, + &subpel_params, sf, width, height, &conv_params, + filters, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, plane, ref_num, mi, + build_for_obmc, xd, cm->allow_warped_motion); + + return; + } + } + + const InterpFilterParams filter = + av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + if (!subpel_x_q3 && !subpel_y_q3) { - int i; - for (i = 0; i < height; i++) { + for (int i = 0; i < height; i++) { memcpy(comp_pred, ref, width * sizeof(*comp_pred)); comp_pred += width; ref += ref_stride; } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, + width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, + width, height); } else { - InterpFilterParams filter; - filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); - if (!subpel_y_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - /*Directly call C version to allow this to work for small (2x2) sizes.*/ - aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, - -1, width, height); - } else if (!subpel_x_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - /*Directly call C version to allow this to work for small (2x2) sizes.*/ - aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, - 16, width, height); - } else { - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - const int16_t *kernel_x; - const int16_t *kernel_y; - int intermediate_height; - kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - /*Directly call C versions to allow this to work for small (2x2) sizes.*/ - aom_convolve8_horiz_c(ref - ref_stride * ((filter.taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, - -1, width, intermediate_height); - aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), - MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, - 16, width, height); - } + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride, + temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); } } -void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, +void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride) { int i, j; - aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, - ref_stride); + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); @@ -352,26 +443,68 @@ void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, } } -#if CONFIG_HIGHBITDEPTH -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, int64_t *sum) { +void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const JNT_COMP_PARAMS *jcp_param) { int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + ref += ref_stride; + } +} - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { +void aom_jnt_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const JNT_COMP_PARAMS *jcp_param) { + int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride); + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint8_t)tmp; + } + comp_pred += width; + pred += width; + } +} + +static void highbd_variance64(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + int64_t tsum = 0; + uint64_t tsse = 0; + for (int i = 0; i < h; ++i) { + int32_t lsum = 0; + for (int j = 0; j < w; ++j) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + lsum += diff; + tsse += (uint32_t)(diff * diff); } + tsum += lsum; a += a_stride; b += b_stride; } + *sum = tsum; + *sse = tsse; } uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, @@ -573,65 +706,125 @@ void aom_highbd_var_filter_block2d_bil_second_pass( dst, dst_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ } /* All three forms of the variance are available in the same sizes. */ @@ -640,11 +833,9 @@ void aom_highbd_var_filter_block2d_bil_second_pass( HIGHBD_SUBPIX_VAR(W, H) \ HIGHBD_SUBPIX_AVG_VAR(W, H) -#if CONFIG_AV1 && CONFIG_EXT_PARTITION HIGHBD_VARIANCES(128, 128) HIGHBD_VARIANCES(128, 64) HIGHBD_VARIANCES(64, 128) -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION HIGHBD_VARIANCES(64, 64) HIGHBD_VARIANCES(64, 32) HIGHBD_VARIANCES(32, 64) @@ -661,19 +852,12 @@ HIGHBD_VARIANCES(4, 4) HIGHBD_VARIANCES(4, 2) HIGHBD_VARIANCES(2, 4) HIGHBD_VARIANCES(2, 2) - -#if CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES HIGHBD_VARIANCES(4, 16) HIGHBD_VARIANCES(16, 4) HIGHBD_VARIANCES(8, 32) HIGHBD_VARIANCES(32, 8) HIGHBD_VARIANCES(16, 64) HIGHBD_VARIANCES(64, 16) -#if CONFIG_EXT_PARTITION -HIGHBD_VARIANCES(32, 128) -HIGHBD_VARIANCES(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION_TYPES HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) @@ -700,9 +884,99 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } } -void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height, +void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, + const struct AV1Common *const cm, int mi_row, + int mi_col, const MV *const mv, + uint16_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + // Note: This is mostly a copy from the >=8X8 case in + // build_inter_predictors() function, with some small tweaks. + uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); + + // Some assumptions. + const int plane = 0; + + // Get pre-requisites. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = pd->subsampling_x; + const int ssy = pd->subsampling_y; + assert(ssx == 0 && ssy == 0); + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + + // Calculate subpel_x/y and x/y_step. + const int row_start = 0; // Because ss_y is 0. + const int col_start = 0; // Because ss_x is 0. + const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; + int orig_pos_y = pre_y << SUBPEL_BITS; + orig_pos_y += mv->row * (1 << (1 - ssy)); + int orig_pos_x = pre_x << SUBPEL_BITS; + orig_pos_x += mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + const uint8_t *const pre = + pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + pos_x & SCALE_SUBPEL_MASK, + pos_y & SCALE_SUBPEL_MASK }; + + // Get warp types. + const WarpedMotionParams *const wm = + &xd->global_motion[mi->ref_frame[ref_num]]; + const int is_global = is_global_mv_block(mi, wm->wmtype); + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + // Get convolve parameters. + ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + const InterpFilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // Get the inter predictor. + const int build_for_obmc = 0; + av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, + &subpel_params, sf, width, height, &conv_params, + filters, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, plane, ref_num, mi, + build_for_obmc, xd, cm->allow_warped_motion); + + return; + } + } + + const InterpFilterParams filter = + av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + if (!subpel_x_q3 && !subpel_y_q3) { const uint16_t *ref; int i; @@ -712,57 +986,48 @@ void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height, comp_pred += width; ref += ref_stride; } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), + width, kernel, 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), + width, NULL, -1, kernel, 16, width, height, bd); } else { - InterpFilterParams filter; - filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); - if (!subpel_y_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - /*Directly call C version to allow this to work for small (2x2) sizes.*/ - aom_highbd_convolve8_horiz_c(ref8, ref_stride, - CONVERT_TO_BYTEPTR(comp_pred), width, kernel, - 16, NULL, -1, width, height, bd); - } else if (!subpel_x_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - /*Directly call C version to allow this to work for small (2x2) sizes.*/ - aom_highbd_convolve8_vert_c(ref8, ref_stride, - CONVERT_TO_BYTEPTR(comp_pred), width, NULL, - -1, kernel, 16, width, height, bd); - } else { - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); - const int16_t *kernel_x; - const int16_t *kernel_y; - int intermediate_height; - kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - /*Directly call C versions to allow this to work for small (2x2) sizes.*/ - aom_highbd_convolve8_horiz_c(ref8 - ref_stride * ((filter.taps >> 1) - 1), - ref_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height, bd); - aom_highbd_convolve8_vert_c( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); - } + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, + 16, width, height, bd); } } -void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, - const uint8_t *pred8, int width, - int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd) { +void aom_highbd_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd) { int i, j; const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, - ref8, ref_stride, bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); @@ -771,69 +1036,109 @@ void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred, pred += width; } } -#endif // CONFIG_HIGHBITDEPTH -#if CONFIG_AV1 -void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride, - const uint8_t *mask, int mask_stride, - int invert_mask) { +void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, + const JNT_COMP_PARAMS *jcp_param) { int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { - if (!invert_mask) - comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]); - else - comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]); + int tmp = pred[j] * bck_offset + ref[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; } comp_pred += width; pred += width; ref += ref_stride; - mask += mask_stride; } } -void aom_comp_mask_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { +void aom_highbd_jnt_comp_avg_upsampled_pred_c( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { int i, j; + const int fwd_offset = jcp_param->fwd_offset; + const int bck_offset = jcp_param->bck_offset; + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd); - aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, - ref_stride); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { - if (!invert_mask) - comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]); - else - comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]); + int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset; + tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS); + comp_pred[j] = (uint16_t)tmp; } comp_pred += width; pred += width; + } +} + +void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i, j; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + for (i = 0; i < height; ++i) { + for (j = 0; j < width; ++j) { + comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]); + } + comp_pred += width; + src0 += stride0; + src1 += stride1; mask += mask_stride; } } -#define MASK_SUBPIX_VAR(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ - invert_mask); \ - return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ +void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ } MASK_SUBPIX_VAR(4, 4) @@ -849,26 +1154,16 @@ MASK_SUBPIX_VAR(32, 32) MASK_SUBPIX_VAR(32, 64) MASK_SUBPIX_VAR(64, 32) MASK_SUBPIX_VAR(64, 64) -#if CONFIG_EXT_PARTITION MASK_SUBPIX_VAR(64, 128) MASK_SUBPIX_VAR(128, 64) MASK_SUBPIX_VAR(128, 128) -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_EXT_PARTITION_TYPES MASK_SUBPIX_VAR(4, 16) MASK_SUBPIX_VAR(16, 4) MASK_SUBPIX_VAR(8, 32) MASK_SUBPIX_VAR(32, 8) MASK_SUBPIX_VAR(16, 64) MASK_SUBPIX_VAR(64, 16) -#if CONFIG_EXT_PARTITION -MASK_SUBPIX_VAR(32, 128) -MASK_SUBPIX_VAR(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#if CONFIG_HIGHBITDEPTH void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, @@ -891,14 +1186,17 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } void aom_highbd_comp_mask_upsampled_pred_c( - uint16_t *comp_pred, const uint8_t *pred8, int width, int height, - int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, - const uint8_t *mask, int mask_stride, int invert_mask, int bd) { + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, - ref8, ref_stride, bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { if (!invert_mask) @@ -992,28 +1290,16 @@ HIGHBD_MASK_SUBPIX_VAR(32, 32) HIGHBD_MASK_SUBPIX_VAR(32, 64) HIGHBD_MASK_SUBPIX_VAR(64, 32) HIGHBD_MASK_SUBPIX_VAR(64, 64) -#if CONFIG_EXT_PARTITION HIGHBD_MASK_SUBPIX_VAR(64, 128) HIGHBD_MASK_SUBPIX_VAR(128, 64) HIGHBD_MASK_SUBPIX_VAR(128, 128) -#endif // CONFIG_EXT_PARTITION - -#if CONFIG_EXT_PARTITION_TYPES HIGHBD_MASK_SUBPIX_VAR(4, 16) HIGHBD_MASK_SUBPIX_VAR(16, 4) HIGHBD_MASK_SUBPIX_VAR(8, 32) HIGHBD_MASK_SUBPIX_VAR(32, 8) HIGHBD_MASK_SUBPIX_VAR(16, 64) HIGHBD_MASK_SUBPIX_VAR(64, 16) -#if CONFIG_EXT_PARTITION -HIGHBD_MASK_SUBPIX_VAR(32, 128) -HIGHBD_MASK_SUBPIX_VAR(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 - -#if CONFIG_AV1 && CONFIG_MOTION_VAR + static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, unsigned int *sse, int *sum) { @@ -1044,19 +1330,19 @@ static INLINE void obmc_variance(const uint8_t *pre, int pre_stride, return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } -#define OBMC_SUBPIX_VAR(W, H) \ - unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ + W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ } OBMC_VAR(4, 4) @@ -1098,7 +1384,6 @@ OBMC_SUBPIX_VAR(64, 32) OBMC_VAR(64, 64) OBMC_SUBPIX_VAR(64, 64) -#if CONFIG_EXT_PARTITION OBMC_VAR(64, 128) OBMC_SUBPIX_VAR(64, 128) @@ -1107,9 +1392,7 @@ OBMC_SUBPIX_VAR(128, 64) OBMC_VAR(128, 128) OBMC_SUBPIX_VAR(128, 128) -#endif // CONFIG_EXT_PARTITION -#if CONFIG_EXT_PARTITION_TYPES OBMC_VAR(4, 16) OBMC_SUBPIX_VAR(4, 16) OBMC_VAR(16, 4) @@ -1122,15 +1405,7 @@ OBMC_VAR(16, 64) OBMC_SUBPIX_VAR(16, 64) OBMC_VAR(64, 16) OBMC_SUBPIX_VAR(64, 16) -#if CONFIG_EXT_PARTITION -OBMC_VAR(32, 128) -OBMC_SUBPIX_VAR(32, 128) -OBMC_VAR(128, 32) -OBMC_SUBPIX_VAR(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES - -#if CONFIG_HIGHBITDEPTH + static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride, const int32_t *wsrc, const int32_t *mask, int w, int h, @@ -1301,7 +1576,6 @@ HIGHBD_OBMC_SUBPIX_VAR(64, 32) HIGHBD_OBMC_VAR(64, 64) HIGHBD_OBMC_SUBPIX_VAR(64, 64) -#if CONFIG_EXT_PARTITION HIGHBD_OBMC_VAR(64, 128) HIGHBD_OBMC_SUBPIX_VAR(64, 128) @@ -1310,9 +1584,7 @@ HIGHBD_OBMC_SUBPIX_VAR(128, 64) HIGHBD_OBMC_VAR(128, 128) HIGHBD_OBMC_SUBPIX_VAR(128, 128) -#endif // CONFIG_EXT_PARTITION -#if CONFIG_EXT_PARTITION_TYPES HIGHBD_OBMC_VAR(4, 16) HIGHBD_OBMC_SUBPIX_VAR(4, 16) HIGHBD_OBMC_VAR(16, 4) @@ -1325,12 +1597,3 @@ HIGHBD_OBMC_VAR(16, 64) HIGHBD_OBMC_SUBPIX_VAR(16, 64) HIGHBD_OBMC_VAR(64, 16) HIGHBD_OBMC_SUBPIX_VAR(64, 16) -#if CONFIG_EXT_PARTITION -HIGHBD_OBMC_VAR(32, 128) -HIGHBD_OBMC_SUBPIX_VAR(32, 128) -HIGHBD_OBMC_VAR(128, 32) -HIGHBD_OBMC_SUBPIX_VAR(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_HIGHBITDEPTH -#endif // CONFIG_AV1 && CONFIG_MOTION_VAR diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index a193df467..544dda944 100644 --- a/third_party/aom/aom_dsp/variance.h +++ b/third_party/aom/aom_dsp/variance.h @@ -12,7 +12,7 @@ #ifndef AOM_DSP_VARIANCE_H_ #define AOM_DSP_VARIANCE_H_ -#include "./aom_config.h" +#include "config/aom_config.h" #include "aom/aom_integer.h" @@ -33,10 +33,6 @@ typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride, typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, int b_stride, int n); -typedef void (*aom_sad_multi_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sad_array); - typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, const uint8_t *const b_array[], int b_stride, unsigned int *sad_array); @@ -54,7 +50,16 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse, const uint8_t *second_pred); -#if CONFIG_AV1 +typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *second_pred, + const JNT_COMP_PARAMS *jcp_param); + +typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)( + const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, + int b_stride, unsigned int *sse, const uint8_t *second_pred, + const JNT_COMP_PARAMS *jcp_param); + typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, @@ -64,9 +69,13 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)( const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); -#endif // CONFIG_AV1 -#if CONFIG_AV1 && CONFIG_MOTION_VAR +void aom_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); + typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, const int32_t *msk); @@ -78,27 +87,22 @@ typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred, typedef unsigned int (*aom_obmc_subpixvariance_fn_t)( const uint8_t *pred, int pred_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *msk, unsigned int *sse); -#endif // CONFIG_AV1 && CONFIG_MOTION_VAR -#if CONFIG_AV1 typedef struct aom_variance_vtable { aom_sad_fn_t sdf; aom_sad_avg_fn_t sdaf; aom_variance_fn_t vf; aom_subpixvariance_fn_t svf; aom_subp_avg_variance_fn_t svaf; - aom_sad_multi_fn_t sdx3f; - aom_sad_multi_fn_t sdx8f; aom_sad_multi_d_fn_t sdx4df; aom_masked_sad_fn_t msdf; aom_masked_subpixvariance_fn_t msvf; -#if CONFIG_MOTION_VAR aom_obmc_sad_fn_t osdf; aom_obmc_variance_fn_t ovf; aom_obmc_subpixvariance_fn_t osvf; -#endif // CONFIG_MOTION_VAR + aom_jnt_sad_avg_fn_t jsdaf; + aom_jnt_subp_avg_variance_fn_t jsvaf; } aom_variance_fn_ptr_t; -#endif // CONFIG_AV1 void aom_highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, @@ -115,10 +119,8 @@ void aom_highbd_var_filter_block2d_bil_second_pass( uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h); -#if CONFIG_HIGHBITDEPTH uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h); -#endif // CONFIG_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c index 4067b0b53..401fbdc48 100644 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -9,8 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/convolve.h" #if HAVE_SSE2 @@ -20,12 +21,6 @@ filter8_1dfunction aom_filter_block1d8_v8_sse2; filter8_1dfunction aom_filter_block1d8_h8_sse2; filter8_1dfunction aom_filter_block1d4_v8_sse2; filter8_1dfunction aom_filter_block1d4_h8_sse2; -filter8_1dfunction aom_filter_block1d16_v8_avg_sse2; -filter8_1dfunction aom_filter_block1d16_h8_avg_sse2; -filter8_1dfunction aom_filter_block1d8_v8_avg_sse2; -filter8_1dfunction aom_filter_block1d8_h8_avg_sse2; -filter8_1dfunction aom_filter_block1d4_v8_avg_sse2; -filter8_1dfunction aom_filter_block1d4_h8_avg_sse2; filter8_1dfunction aom_filter_block1d16_v2_sse2; filter8_1dfunction aom_filter_block1d16_h2_sse2; @@ -33,12 +28,6 @@ filter8_1dfunction aom_filter_block1d8_v2_sse2; filter8_1dfunction aom_filter_block1d8_h2_sse2; filter8_1dfunction aom_filter_block1d4_v2_sse2; filter8_1dfunction aom_filter_block1d4_h2_sse2; -filter8_1dfunction aom_filter_block1d16_v2_avg_sse2; -filter8_1dfunction aom_filter_block1d16_h2_avg_sse2; -filter8_1dfunction aom_filter_block1d8_v2_avg_sse2; -filter8_1dfunction aom_filter_block1d8_h2_avg_sse2; -filter8_1dfunction aom_filter_block1d4_v2_avg_sse2; -filter8_1dfunction aom_filter_block1d4_h2_avg_sse2; // void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -50,47 +39,16 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_sse2; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void aom_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); - -// void aom_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_, sse2); -#if CONFIG_HIGHBITDEPTH && ARCH_X86_64 +#if ARCH_X86_64 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_avg_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; @@ -98,12 +56,6 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2; // void aom_highbd_convolve8_horiz_sse2(const uint8_t *src, // ptrdiff_t src_stride, @@ -123,60 +75,8 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -// void aom_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void aom_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - sse2); - -// void aom_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -// void aom_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_, sse2); -#if CONFIG_LOOP_RESTORATION -// The SSE2 highbd convolve functions can deal with coefficients up to 32767. -// So redirect highbd_convolve8_add_src to regular highbd_convolve8. -void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd) { - assert(x_step_q4 == 16); - assert(y_step_q4 == 16); - ((int16_t *)filter_x)[3] += 128; - ((int16_t *)filter_y)[3] += 128; - aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); - ((int16_t *)filter_x)[3] -= 128; - ((int16_t *)filter_y)[3] -= 128; -} -#endif // CONFIG_LOOP_RESTORATION -#endif // CONFIG_HIGHBITDEPTH && ARCH_X86_64 +#endif // ARCH_X86_64 #endif // HAVE_SSE2 diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm index 4d3142867..7283c32b8 100644 --- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm @@ -50,7 +50,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ cmp r4d, 32 je .w32 -%if CONFIG_AV1 && CONFIG_EXT_PARTITION cmp r4d, 64 je .w64 %ifidn %2, highbd @@ -160,50 +159,6 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ jnz .loop128 RET -%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION - -%ifidn %2, highbd - cmp r4d, 64 - je .w64 - - mov r4d, dword hm -.loop128: - movu m0, [srcq] - movu m1, [srcq+16] - movu m2, [srcq+32] - movu m3, [srcq+48] -%ifidn %1, avg - pavg m0, [dstq] - pavg m1, [dstq+16] - pavg m2, [dstq+32] - pavg m3, [dstq+48] -%endif - mova [dstq ], m0 - mova [dstq+16], m1 - mova [dstq+32], m2 - mova [dstq+48], m3 - movu m0, [srcq+64] - movu m1, [srcq+80] - movu m2, [srcq+96] - movu m3, [srcq+112] - add srcq, src_strideq -%ifidn %1, avg - pavg m0, [dstq+64] - pavg m1, [dstq+80] - pavg m2, [dstq+96] - pavg m3, [dstq+112] -%endif - mova [dstq+64], m0 - mova [dstq+80], m1 - mova [dstq+96], m2 - mova [dstq+112], m3 - add dstq, dst_strideq - sub r4d, 1 - jnz .loop128 - RET -%endif -%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION - .w64: mov r4d, dword hm .loop64: @@ -339,7 +294,4 @@ cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ INIT_XMM sse2 convolve_fn copy convolve_fn avg -%if CONFIG_HIGHBITDEPTH convolve_fn copy, highbd -convolve_fn avg, highbd -%endif diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c deleted file mode 100644 index 14352895d..000000000 --- a/third_party/aom/aom_dsp/x86/aom_convolve_hip_sse2.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" - -void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - const int bd = 8; - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - (void)x_step_q4; - (void)y_step_q4; - - uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; - int intermediate_height = h + SUBPEL_TAPS - 1; - int i, j; - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero = _mm_setzero_si128(); - // Add an offset to account for the "add_src" part of the convolve function. - const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); - - /* Horizontal filter */ - { - const __m128i coeffs_x = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + - (1 << (bd + FILTER_BITS - 1))); - - for (i = 0; i < intermediate_height; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - // Filter even-index pixels - const __m128i src_0 = _mm_unpacklo_epi8(data, zero); - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Filter odd-index pixels - const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_min_epi16(_mm_max_epi16(res, zero), - _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1)); - _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); - } - } - } - - /* Vertical filter */ - { - const __m128i coeffs_y = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); - - const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); - - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storel_epi64(p, res_8bit); - } - } - } -} diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm index e6d357ba3..b6f040791 100644 --- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm @@ -200,6 +200,8 @@ movdqu [rdi + %2], xmm0 %endm +SECTION .text + ;void aom_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, @@ -392,169 +394,6 @@ sym(aom_highbd_filter_block1d16_v8_sse2): pop rbp ret -global sym(aom_highbd_filter_block1d4_v8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 7 - %define k0k6 [rsp + 16 * 0] - %define k2k5 [rsp + 16 * 1] - %define k3k4 [rsp + 16 * 2] - %define k1k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define max [rsp + 16 * 5] - %define min [rsp + 16 * 6] - - HIGH_GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movq xmm0, [rsi] ;load src: row 0 - movq xmm1, [rsi + rax] ;1 - movq xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2] ;7 - movq xmm2, [rsi + rax] ;2 - movq xmm3, [rsi + rax * 2] ;3 - movq xmm4, [rsi + rdx] ;4 - movq xmm5, [rsi + rax * 4] ;5 - - HIGH_APPLY_FILTER_4 1 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 7 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d8_v8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - HIGH_APPLY_FILTER_8 1, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_v8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rbx, [rbx + rbx] - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - HIGH_APPLY_FILTER_8 1, 0 - sub rsi, rax - - LOAD_VERT_8 16 - HIGH_APPLY_FILTER_8 1, 16 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void aom_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, @@ -772,194 +611,3 @@ sym(aom_highbd_filter_block1d16_h8_sse2): UNSHADOW_ARGS pop rbp ret - -global sym(aom_highbd_filter_block1d4_h8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 7 - %define k0k6 [rsp + 16 * 0] - %define k2k5 [rsp + 16 * 1] - %define k3k4 [rsp + 16 * 2] - %define k1k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define max [rsp + 16 * 5] - %define min [rsp + 16 * 6] - - HIGH_GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm4, [rsi + 2] - movdqa xmm1, xmm0 - movdqa xmm6, xmm4 - movdqa xmm7, xmm4 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm4 - - psrldq xmm1, 2 - psrldq xmm6, 4 - psrldq xmm7, 6 - psrldq xmm2, 4 - psrldq xmm3, 6 - psrldq xmm5, 2 - - HIGH_APPLY_FILTER_4 1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 7 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d8_h8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm1, [rsi - 4] - movdqu xmm2, [rsi - 2] - movdqu xmm3, [rsi] - movdqu xmm4, [rsi + 2] - movdqu xmm5, [rsi + 4] - movdqu xmm6, [rsi + 6] - movdqu xmm7, [rsi + 8] - - HIGH_APPLY_FILTER_8 1, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_h8_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 8 - %define k0k1 [rsp + 16 * 0] - %define k6k7 [rsp + 16 * 1] - %define k2k5 [rsp + 16 * 2] - %define k3k4 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define temp [rsp + 16 * 5] - %define max [rsp + 16 * 6] - %define min [rsp + 16 * 7] - - HIGH_GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - lea rax, [rax + rax] ;bytes per line - lea rdx, [rdx + rdx] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 6] ;load src - movdqu xmm1, [rsi - 4] - movdqu xmm2, [rsi - 2] - movdqu xmm3, [rsi] - movdqu xmm4, [rsi + 2] - movdqu xmm5, [rsi + 4] - movdqu xmm6, [rsi + 6] - movdqu xmm7, [rsi + 8] - - HIGH_APPLY_FILTER_8 1, 0 - - movdqu xmm0, [rsi + 10] ;load src - movdqu xmm1, [rsi + 12] - movdqu xmm2, [rsi + 14] - movdqu xmm3, [rsi + 16] - movdqu xmm4, [rsi + 18] - movdqu xmm5, [rsi + 20] - movdqu xmm6, [rsi + 22] - movdqu xmm7, [rsi + 24] - - HIGH_APPLY_FILTER_8 1, 16 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 8 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm index 9e2ec748c..7b3fe6419 100644 --- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -174,6 +174,8 @@ %endm %endif +SECTION .text + global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE sym(aom_highbd_filter_block1d4_v2_sse2): push rbp @@ -254,86 +256,6 @@ sym(aom_highbd_filter_block1d16_v2_sse2): ret %endif -global sym(aom_highbd_filter_block1d4_v2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM_4 -.loop: - movq xmm0, [rsi] ;load src - movq xmm1, [rsi + 2*rax] - - HIGH_APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -%if ARCH_X86_64 -global sym(aom_highbd_filter_block1d8_v2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 8 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + 2*rax] ;1 - - HIGH_APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_v2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 9 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + 2*rax] ;1 - movdqu xmm2, [rsi + 16] - movdqu xmm3, [rsi + 2*rax + 16] - - HIGH_APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%endif - global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE sym(aom_highbd_filter_block1d4_h2_sse2): push rbp @@ -414,84 +336,3 @@ sym(aom_highbd_filter_block1d16_h2_sse2): pop rbp ret %endif - -global sym(aom_highbd_filter_block1d4_h2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d4_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 2 - - HIGH_APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -%if ARCH_X86_64 -global sym(aom_highbd_filter_block1d8_h2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d8_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 8 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 2] - - HIGH_APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_highbd_filter_block1d16_h2_avg_sse2) PRIVATE -sym(aom_highbd_filter_block1d16_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 9 - push rsi - push rdi - ; end prolog - - HIGH_GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 2] - movdqu xmm2, [rsi + 16] - movdqu xmm3, [rsi + 18] - - HIGH_APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%endif diff --git a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c deleted file mode 100644 index 74ce80e50..000000000 --- a/third_party/aom/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/aom_convolve.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_dsp/aom_filter.h" - -#if EXTRAPREC_BITS > 2 -#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2" -#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)" -#endif - -void aom_highbd_convolve8_add_src_hip_ssse3( - const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { - assert(x_step_q4 == 16 && y_step_q4 == 16); - assert(!(w & 7)); - (void)x_step_q4; - (void)y_step_q4; - - const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); - uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); - - uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; - int intermediate_height = h + SUBPEL_TAPS - 1; - int i, j; - const int center_tap = ((SUBPEL_TAPS - 1) / 2); - const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; - - const __m128i zero = _mm_setzero_si128(); - // Add an offset to account for the "add_src" part of the convolve function. - const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); - - /* Horizontal filter */ - { - const __m128i coeffs_x = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + - (1 << (bd + FILTER_BITS - 1))); - - for (i = 0; i < intermediate_height; ++i) { - for (j = 0; j < w; j += 8) { - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - const __m128i data2 = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(data, coeff_01); - const __m128i res_2 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); - const __m128i res_4 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); - const __m128i res_6 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Filter odd-index pixels - const __m128i res_1 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); - const __m128i res_3 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); - const __m128i res_5 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); - const __m128i res_7 = - _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), - FILTER_BITS - EXTRAPREC_BITS); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1); - __m128i res = _mm_packs_epi32(res_even, res_odd); - res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); - _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); - } - } - } - - /* Vertical filter */ - { - const __m128i coeffs_y = - _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = - _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), - *(__m128i *)(data + 1 * MAX_SB_SIZE)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), - *(__m128i *)(data + 3 * MAX_SB_SIZE)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), - *(__m128i *)(data + 5 * MAX_SB_SIZE)); - const __m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), - *(__m128i *)(data + 7 * MAX_SB_SIZE)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = _mm_srai_epi32( - _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); - const __m128i res_hi_round = _mm_srai_epi32( - _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); - - const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); - __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); - res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); - - __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; - _mm_storeu_si128(p, res_16bit); - } - } - } -} diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index 61476b8be..af45a03ac 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -11,31 +11,12 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" #include "aom_ports/mem.h" -// filters for 16_h8 and 16_v8 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; - #if defined(__clang__) #if (__clang_major__ > 0 && __clang_major__ < 3) || \ (__clang_major__ == 3 && __clang_minor__ <= 3) || \ @@ -566,10 +547,4 @@ filter8_1dfunction aom_filter_block1d4_h2_ssse3; FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); -// void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, avx2); #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index be37738df..6bcb4a512 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -11,7 +11,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve.h" #include "aom_mem/aom_mem.h" @@ -285,20 +286,6 @@ filter8_1dfunction aom_filter_block1d8_v8_ssse3; filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; -filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3; -filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3; -#if CONFIG_LOOP_RESTORATION -filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3; -#endif filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; @@ -306,12 +293,6 @@ filter8_1dfunction aom_filter_block1d8_v2_ssse3; filter8_1dfunction aom_filter_block1d8_h2_ssse3; filter8_1dfunction aom_filter_block1d4_v2_ssse3; filter8_1dfunction aom_filter_block1d4_h2_ssse3; -filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3; -filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3; -filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3; -filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3; // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, @@ -323,598 +304,5 @@ filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3; // const int16_t *filter_x, int x_step_q4, // const int16_t *filter_y, int y_step_q4, // int w, int h); -// void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -#if CONFIG_LOOP_RESTORATION -FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_, - ssse3); -FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v, - src - src_stride * 3, add_src_, ssse3); -#endif - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ - } - -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)dst, temp); -} - -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; - - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i *)dst, A); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); -} - -static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - // This function processes 8x8 areas. The intermediate height is not always - // a multiple of 8, so force it to be a multiple of 8 here. - y = h + (8 - (h & 0x7)); - - do { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 8) { - // process 8 src_x steps - for (z = 0; z < 8; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter); - } else { - int i; - for (i = 0; i < 8; ++i) { - temp[z * 8 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 8x8 filters values back to dst - transpose8x8_to_dst(temp, 8, dst + x, dst_stride); - } - - src += src_stride * 8; - dst += dst_stride * 8; - } while (y -= 8); -} - -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); - // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = _mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); -} - -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { - DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); - int x, y, z; - src -= SUBPEL_TAPS / 2 - 1; - - for (y = 0; y < h; y += 4) { - int x_q4 = x0_q4; - for (x = 0; x < w; x += 4) { - // process 4 src_x steps - for (z = 0; z < 4; ++z) { - const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; - const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; - if (x_q4 & SUBPEL_MASK) { - filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter); - } else { - int i; - for (i = 0; i < 4; ++i) { - temp[z * 4 + i] = src_x[i * src_stride + 3]; - } - } - x_q4 += x_step_q4; - } - - // transpose the 4x4 filters values back to dst - transpose4x4_to_dst(temp, 4, dst + x, dst_stride); - } - - src += src_stride * 4; - dst += dst_stride * 4; - } -} - -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 4 bytes - *(int *)dst = _mm_cvtsi128_si32(temp); -} - -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - - if (y_q4 & SUBPEL_MASK) { - filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - - y_q4 += y_step_q4; - } -} - -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)dst, temp); -} - -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - int i; - - for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); - // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); - - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; - // save 16 bytes convolve result - _mm_store_si128((__m128i *)&dst[i], temp_hi); - } -} - -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { - int y; - int y_q4 = y0_q4; - - src -= src_stride * (SUBPEL_TAPS / 2 - 1); - for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; - const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; - if (y_q4 & SUBPEL_MASK) { - filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter, - w); - } else { - memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w); - } - y_q4 += y_step_q4; - } -} - -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - // --Require an additional 8 rows for the horiz_w8 transpose tail. - DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]); - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= MAX_SB_SIZE); - assert(h <= MAX_SB_SIZE); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - if (w >= 8) { - scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, - x_step_q4, w, intermediate_height); - } else { - scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4, - x_step_q4, w, intermediate_height); - } - - if (w >= 16) { - scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, - y_step_q4, w, h); - } else if (w == 8) { - scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, - y_step_q4, w, h); - } else { - scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), - MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, - y_step_q4, w, h); - } -} - -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - -void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - -// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_, ssse3); -#if CONFIG_LOOP_RESTORATION -FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3); -#endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm index b946010d3..c88fc9ffb 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm @@ -179,6 +179,8 @@ movq [rdi + %2], xmm0 %endm +SECTION .text + ;void aom_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, @@ -368,166 +370,6 @@ sym(aom_filter_block1d16_v8_sse2): pop rbp ret -global sym(aom_filter_block1d4_v8_avg_sse2) PRIVATE -sym(aom_filter_block1d4_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 1 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v8_avg_sse2) PRIVATE -sym(aom_filter_block1d8_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v8_avg_sse2) PRIVATE -sym(aom_filter_block1d16_v8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 1, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 1, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void aom_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, @@ -771,220 +613,3 @@ sym(aom_filter_block1d16_h8_sse2): UNSHADOW_ARGS pop rbp ret - -global sym(aom_filter_block1d4_h8_avg_sse2) PRIVATE -sym(aom_filter_block1d4_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_h8_avg_sse2) PRIVATE -sym(aom_filter_block1d8_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h8_avg_sse2) PRIVATE -sym(aom_filter_block1d16_h8_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 1, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm index 8688fb544..3ca7921b6 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm @@ -375,17 +375,8 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ INIT_XMM ssse3 SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg - -%if CONFIG_LOOP_RESTORATION -SUBPIX_HFILTER16 h8_add_src -SUBPIX_HFILTER8 h8_add_src -SUBPIX_HFILTER4 h8_add_src -%endif ;------------------------------------------------------------------------------- @@ -875,15 +866,5 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ INIT_XMM ssse3 SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 - -%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \ - CONFIG_LOOP_RESTORATION -SUBPIX_VFILTER16 v8_add_src -SUBPIX_VFILTER v8_add_src, 8 -SUBPIX_VFILTER v8_add_src, 4 -%endif diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm index 8f025a8be..d0b4b2839 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm @@ -134,6 +134,8 @@ dec rcx %endm +SECTION .text + global sym(aom_filter_block1d4_v2_sse2) PRIVATE sym(aom_filter_block1d4_v2_sse2): push rbp @@ -212,84 +214,6 @@ sym(aom_filter_block1d16_v2_sse2): pop rbp ret -global sym(aom_filter_block1d4_v2_avg_sse2) PRIVATE -sym(aom_filter_block1d4_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v2_avg_sse2) PRIVATE -sym(aom_filter_block1d8_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v2_avg_sse2) PRIVATE -sym(aom_filter_block1d16_v2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - global sym(aom_filter_block1d4_h2_sse2) PRIVATE sym(aom_filter_block1d4_h2_sse2): push rbp @@ -369,83 +293,3 @@ sym(aom_filter_block1d16_h2_sse2): UNSHADOW_ARGS pop rbp ret - -global sym(aom_filter_block1d4_h2_avg_sse2) PRIVATE -sym(aom_filter_block1d4_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_h2_avg_sse2) PRIVATE -sym(aom_filter_block1d8_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h2_avg_sse2) PRIVATE -sym(aom_filter_block1d16_h2_avg_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm index b9b2da0be..59edc49a9 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm @@ -108,6 +108,8 @@ dec rcx %endm +SECTION .text + global sym(aom_filter_block1d4_v2_ssse3) PRIVATE sym(aom_filter_block1d4_v2_ssse3): push rbp @@ -185,83 +187,6 @@ sym(aom_filter_block1d16_v2_ssse3): pop rbp ret -global sym(aom_filter_block1d4_v2_avg_ssse3) PRIVATE -sym(aom_filter_block1d4_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_v2_avg_ssse3) PRIVATE -sym(aom_filter_block1d8_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_v2_avg_ssse3) PRIVATE -sym(aom_filter_block1d16_v2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - global sym(aom_filter_block1d4_h2_ssse3) PRIVATE sym(aom_filter_block1d4_h2_ssse3): push rbp @@ -340,82 +265,3 @@ sym(aom_filter_block1d16_h2_ssse3): UNSHADOW_ARGS pop rbp ret - -global sym(aom_filter_block1d4_h2_avg_ssse3) PRIVATE -sym(aom_filter_block1d4_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d8_h2_avg_ssse3) PRIVATE -sym(aom_filter_block1d8_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -global sym(aom_filter_block1d16_h2_avg_ssse3) PRIVATE -sym(aom_filter_block1d16_h2_avg_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - - APPLY_FILTER_16 1 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c deleted file mode 100644 index 1a6457402..000000000 --- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "aom_dsp/x86/synonyms.h" - -#include "./aom_dsp_rtcd.h" -#include "aom_ports/mem.h" - -void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, - int *min, int *max) { - __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; - u0 = _mm_setzero_si128(); - // Row 0 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff0 = _mm_max_epi16(diff, negdiff); - // Row 1 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(absdiff0, absdiff); - minabsdiff = _mm_min_epi16(absdiff0, absdiff); - // Row 2 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 3 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 4 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 5 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 6 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 7 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); - *max = _mm_extract_epi16(maxabsdiff, 0); - - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); - *min = _mm_extract_epi16(minabsdiff, 0); -} - -static void hadamard_col8_sse2(__m128i *in, int iter) { - __m128i a0 = in[0]; - __m128i a1 = in[1]; - __m128i a2 = in[2]; - __m128i a3 = in[3]; - __m128i a4 = in[4]; - __m128i a5 = in[5]; - __m128i a6 = in[6]; - __m128i a7 = in[7]; - - __m128i b0 = _mm_add_epi16(a0, a1); - __m128i b1 = _mm_sub_epi16(a0, a1); - __m128i b2 = _mm_add_epi16(a2, a3); - __m128i b3 = _mm_sub_epi16(a2, a3); - __m128i b4 = _mm_add_epi16(a4, a5); - __m128i b5 = _mm_sub_epi16(a4, a5); - __m128i b6 = _mm_add_epi16(a6, a7); - __m128i b7 = _mm_sub_epi16(a6, a7); - - a0 = _mm_add_epi16(b0, b2); - a1 = _mm_add_epi16(b1, b3); - a2 = _mm_sub_epi16(b0, b2); - a3 = _mm_sub_epi16(b1, b3); - a4 = _mm_add_epi16(b4, b6); - a5 = _mm_add_epi16(b5, b7); - a6 = _mm_sub_epi16(b4, b6); - a7 = _mm_sub_epi16(b5, b7); - - if (iter == 0) { - b0 = _mm_add_epi16(a0, a4); - b7 = _mm_add_epi16(a1, a5); - b3 = _mm_add_epi16(a2, a6); - b4 = _mm_add_epi16(a3, a7); - b2 = _mm_sub_epi16(a0, a4); - b6 = _mm_sub_epi16(a1, a5); - b1 = _mm_sub_epi16(a2, a6); - b5 = _mm_sub_epi16(a3, a7); - - a0 = _mm_unpacklo_epi16(b0, b1); - a1 = _mm_unpacklo_epi16(b2, b3); - a2 = _mm_unpackhi_epi16(b0, b1); - a3 = _mm_unpackhi_epi16(b2, b3); - a4 = _mm_unpacklo_epi16(b4, b5); - a5 = _mm_unpacklo_epi16(b6, b7); - a6 = _mm_unpackhi_epi16(b4, b5); - a7 = _mm_unpackhi_epi16(b6, b7); - - b0 = _mm_unpacklo_epi32(a0, a1); - b1 = _mm_unpacklo_epi32(a4, a5); - b2 = _mm_unpackhi_epi32(a0, a1); - b3 = _mm_unpackhi_epi32(a4, a5); - b4 = _mm_unpacklo_epi32(a2, a3); - b5 = _mm_unpacklo_epi32(a6, a7); - b6 = _mm_unpackhi_epi32(a2, a3); - b7 = _mm_unpackhi_epi32(a6, a7); - - in[0] = _mm_unpacklo_epi64(b0, b1); - in[1] = _mm_unpackhi_epi64(b0, b1); - in[2] = _mm_unpacklo_epi64(b2, b3); - in[3] = _mm_unpackhi_epi64(b2, b3); - in[4] = _mm_unpacklo_epi64(b4, b5); - in[5] = _mm_unpackhi_epi64(b4, b5); - in[6] = _mm_unpacklo_epi64(b6, b7); - in[7] = _mm_unpackhi_epi64(b6, b7); - } else { - in[0] = _mm_add_epi16(a0, a4); - in[7] = _mm_add_epi16(a1, a5); - in[3] = _mm_add_epi16(a2, a6); - in[4] = _mm_add_epi16(a3, a7); - in[2] = _mm_sub_epi16(a0, a4); - in[6] = _mm_sub_epi16(a1, a5); - in[1] = _mm_sub_epi16(a2, a6); - in[5] = _mm_sub_epi16(a3, a7); - } -} - -void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - __m128i src[8]; - src[0] = _mm_load_si128((const __m128i *)src_diff); - src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - - hadamard_col8_sse2(src, 0); - hadamard_col8_sse2(src, 1); - - _mm_store_si128((__m128i *)coeff, src[0]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[6]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); -} - -void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = - src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; - aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); - } - - for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); - - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); - - b0 = _mm_srai_epi16(b0, 1); - b1 = _mm_srai_epi16(b1, 1); - b2 = _mm_srai_epi16(b2, 1); - b3 = _mm_srai_epi16(b3, 1); - - coeff0 = _mm_add_epi16(b0, b2); - coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - _mm_store_si128((__m128i *)(coeff + 64), coeff1); - - coeff2 = _mm_sub_epi16(b0, b2); - coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); - - coeff += 8; - } -} - -int aom_satd_sse2(const int16_t *coeff, int length) { - int i; - const __m128i zero = _mm_setzero_si128(); - __m128i accum = zero; - - for (i = 0; i < length; i += 8) { - const __m128i src_line = _mm_load_si128((const __m128i *)coeff); - const __m128i inv = _mm_sub_epi16(zero, src_line); - const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) - const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); - const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); - const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); - accum = _mm_add_epi32(accum, sum); - coeff += 8; - } - - { // cascading summation of accum - __m128i hi = _mm_srli_si128(accum, 8); - accum = _mm_add_epi32(accum, hi); - hi = _mm_srli_epi64(accum, 32); - accum = _mm_add_epi32(accum, hi); - } - - return _mm_cvtsi128_si32(accum); -} - -void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride, - int height) { - int idx; - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_loadu_si128((const __m128i *)ref); - __m128i s0 = _mm_unpacklo_epi8(src_line, zero); - __m128i s1 = _mm_unpackhi_epi8(src_line, zero); - __m128i t0, t1; - int height_1 = height - 1; - ref += ref_stride; - - for (idx = 1; idx < height_1; idx += 2) { - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - } - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - - if (height == 64) { - s0 = _mm_srai_epi16(s0, 5); - s1 = _mm_srai_epi16(s1, 5); - } else if (height == 32) { - s0 = _mm_srai_epi16(s0, 4); - s1 = _mm_srai_epi16(s1, 4); - } else { - s0 = _mm_srai_epi16(s0, 3); - s1 = _mm_srai_epi16(s1, 3); - } - - _mm_storeu_si128((__m128i *)hbuf, s0); - hbuf += 8; - _mm_storeu_si128((__m128i *)hbuf, s1); -} - -int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) { - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_load_si128((const __m128i *)ref); - __m128i s0 = _mm_sad_epu8(src_line, zero); - __m128i s1; - int i; - - for (i = 16; i < width; i += 16) { - ref += 16; - src_line = _mm_load_si128((const __m128i *)ref); - s1 = _mm_sad_epu8(src_line, zero); - s0 = _mm_adds_epu16(s0, s1); - } - - s1 = _mm_srli_si128(s0, 8); - s0 = _mm_adds_epu16(s0, s1); - - return _mm_extract_epi16(s0, 0); -} - -int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) { - int idx; - int width = 4 << bwl; - int16_t mean; - __m128i v0 = _mm_loadu_si128((const __m128i *)ref); - __m128i v1 = _mm_load_si128((const __m128i *)src); - __m128i diff = _mm_subs_epi16(v0, v1); - __m128i sum = diff; - __m128i sse = _mm_madd_epi16(diff, diff); - - ref += 8; - src += 8; - - for (idx = 8; idx < width; idx += 8) { - v0 = _mm_loadu_si128((const __m128i *)ref); - v1 = _mm_load_si128((const __m128i *)src); - diff = _mm_subs_epi16(v0, v1); - - sum = _mm_add_epi16(sum, diff); - v0 = _mm_madd_epi16(diff, diff); - sse = _mm_add_epi32(sse, v0); - - ref += 8; - src += 8; - } - - v0 = _mm_srli_si128(sum, 8); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi32(sum, 16); - sum = _mm_add_epi16(sum, v0); - - v1 = _mm_srli_si128(sse, 8); - sse = _mm_add_epi32(sse, v1); - v1 = _mm_srli_epi64(sse, 32); - sse = _mm_add_epi32(sse, v1); - - mean = _mm_extract_epi16(sum, 0); - - return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); -} diff --git a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm deleted file mode 100644 index b2d150296..000000000 --- a/third_party/aom/aom_dsp/x86/avg_ssse3_x86_64.asm +++ /dev/null @@ -1,124 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%define private_prefix aom - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the hadamard transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. - -SECTION .text - -%if ARCH_X86_64 -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro HMD8_1D 0 - psubw m8, m0, m1 - psubw m9, m2, m3 - paddw m0, m1 - paddw m2, m3 - SWAP 1, 8 - SWAP 3, 9 - psubw m8, m4, m5 - psubw m9, m6, m7 - paddw m4, m5 - paddw m6, m7 - SWAP 5, 8 - SWAP 7, 9 - - psubw m8, m0, m2 - psubw m9, m1, m3 - paddw m0, m2 - paddw m1, m3 - SWAP 2, 8 - SWAP 3, 9 - psubw m8, m4, m6 - psubw m9, m5, m7 - paddw m4, m6 - paddw m5, m7 - SWAP 6, 8 - SWAP 7, 9 - - psubw m8, m0, m4 - psubw m9, m1, m5 - paddw m0, m4 - paddw m1, m5 - SWAP 4, 8 - SWAP 5, 9 - psubw m8, m2, m6 - psubw m9, m3, m7 - paddw m2, m6 - paddw m3, m7 - SWAP 6, 8 - SWAP 7, 9 -%endmacro - -INIT_XMM ssse3 -cglobal hadamard_8x8, 3, 5, 10, input, stride, output - lea r3, [2 * strideq] - lea r4, [4 * strideq] - - mova m0, [inputq] - mova m1, [inputq + r3] - lea inputq, [inputq + r4] - mova m2, [inputq] - mova m3, [inputq + r3] - lea inputq, [inputq + r4] - mova m4, [inputq] - mova m5, [inputq + r3] - lea inputq, [inputq + r4] - mova m6, [inputq] - mova m7, [inputq + r3] - - HMD8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - HMD8_1D - - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 - - RET -%endif diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c index e916e4ff9..4f5e3f8c1 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -11,7 +11,7 @@ #include "aom/aom_integer.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" // To start out, just dispatch to the function using the 2D mask and // pass mask stride as 0. This can be improved upon if necessary. @@ -19,18 +19,16 @@ void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, 0, h, w, 0, 0); + src1_stride, mask, 0, w, h, 0, 0); } -#if CONFIG_HIGHBITDEPTH void aom_highbd_blend_a64_hmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { + const uint8_t *mask, int w, int h, int bd) { aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride, - src1_8, src1_stride, mask, 0, h, w, 0, 0, + src1_8, src1_stride, mask, 0, w, h, 0, 0, bd); } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c index 68d74e517..49c20b467 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c @@ -21,7 +21,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" ////////////////////////////////////////////////////////////////////////////// // No sub-sampling @@ -31,7 +31,7 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -58,7 +58,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, - int h, int w) { + int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -84,7 +84,7 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, static void blend_a64_mask_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -119,7 +119,7 @@ static void blend_a64_mask_w16n_sse4_1( static void blend_a64_mask_sx_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -149,7 +149,7 @@ static void blend_a64_mask_sx_w4_sse4_1( static void blend_a64_mask_sx_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -179,7 +179,7 @@ static void blend_a64_mask_sx_w8_sse4_1( static void blend_a64_mask_sx_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -219,7 +219,7 @@ static void blend_a64_mask_sx_w16n_sse4_1( static void blend_a64_mask_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -248,7 +248,7 @@ static void blend_a64_mask_sy_w4_sse4_1( static void blend_a64_mask_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -277,7 +277,7 @@ static void blend_a64_mask_sy_w8_sse4_1( static void blend_a64_mask_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zero = _mm_setzero_si128(); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -315,7 +315,7 @@ static void blend_a64_mask_sy_w16n_sse4_1( static void blend_a64_mask_sx_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -350,7 +350,7 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( static void blend_a64_mask_sx_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -385,7 +385,7 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( static void blend_a64_mask_sx_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -435,12 +435,12 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, - int w, int suby, int subx) { + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby) { typedef void (*blend_fn)( uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w); + const uint8_t *mask, uint32_t mask_stride, int w, int h); // Dimensions are: width_index X subx X suby static const blend_fn blend[3][2][2] = { @@ -465,15 +465,14 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, h, w, suby, subx); + mask, mask_stride, w, h, subx, suby); } else { blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, h, w); + mask, mask_stride, w, h); } } -#if CONFIG_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -503,7 +502,7 @@ static INLINE void blend_a64_mask_bn_w4_sse4_1( static void blend_a64_mask_b10_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b10); @@ -512,7 +511,7 @@ static void blend_a64_mask_b10_w4_sse4_1( static void blend_a64_mask_b12_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, blend_4_b12); @@ -521,7 +520,7 @@ static void blend_a64_mask_b12_w4_sse4_1( static INLINE void blend_a64_mask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -546,18 +545,18 @@ static INLINE void blend_a64_mask_bn_w8n_sse4_1( static void blend_a64_mask_b10_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -594,7 +593,7 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( static void blend_a64_mask_b10_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -604,7 +603,7 @@ static void blend_a64_mask_b10_sx_w4_sse4_1( static void blend_a64_mask_b12_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -614,7 +613,7 @@ static void blend_a64_mask_b12_sx_w4_sse4_1( static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); @@ -643,18 +642,18 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( static void blend_a64_mask_b10_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sx_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -690,7 +689,7 @@ static INLINE void blend_a64_mask_bn_sy_w4_sse4_1( static void blend_a64_mask_b10_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -700,7 +699,7 @@ static void blend_a64_mask_b10_sy_w4_sse4_1( static void blend_a64_mask_b12_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -710,7 +709,7 @@ static void blend_a64_mask_b12_sy_w4_sse4_1( static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); @@ -738,18 +737,18 @@ static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1( static void blend_a64_mask_b10_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -791,7 +790,7 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( static void blend_a64_mask_b10_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -801,7 +800,7 @@ static void blend_a64_mask_b10_sx_sy_w4_sse4_1( static void blend_a64_mask_b12_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { (void)w; blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, h, @@ -811,7 +810,7 @@ static void blend_a64_mask_b12_sx_sy_w4_sse4_1( static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w, + const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); @@ -845,18 +844,18 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b10); } static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w) { + const uint8_t *mask, uint32_t mask_stride, int w, int h) { blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, mask_stride, h, w, + src1_stride, mask, mask_stride, w, h, blend_8_b12); } @@ -869,12 +868,12 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, - uint32_t mask_stride, int h, int w, - int suby, int subx, int bd) { + uint32_t mask_stride, int w, int h, + int subx, int suby, int bd) { typedef void (*blend_fn)( uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, uint32_t mask_stride, int h, int w); + const uint8_t *mask, uint32_t mask_stride, int w, int h); // Dimensions are: bd_index X width_index X subx X suby static const blend_fn blend[2][2][2][2] = { @@ -909,8 +908,8 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, mask_stride, h, w, suby, - subx, bd); + src1_stride, mask, mask_stride, w, h, subx, + suby, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); @@ -918,7 +917,113 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, - mask_stride, h, w); + mask_stride, w, h); + } +} + +static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0, + const CONV_BUF_TYPE *src1, + const __m128i *m, + const __m128i *v_round_offset, + const __m128i *v_maxval, int round_bits) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS); + const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset); + const __m128i res_d = xx_roundn_epi32(res_c, round_bits); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +void aom_lowbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int round_offset = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_ro_a = xx_loadl_32(&round_offset); + const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0); + const __m128i one_w = _mm_set1_epi16(1); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + + if (subw == 0 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } + } else if (subw == 1 && subh == 1) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m_i0 = + xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]); + const __m128i m_i1 = + xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b); + const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } + } else if (subw == 1 && subh == 0) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); + const __m128i m = _mm_srli_epi16(m_ac_1, 1); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } + } else { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 4) { + const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]); + const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]); + const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1); + const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); + const __m128i m = _mm_srli_epi16(m_ac_1, 1); + + blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], + &src1[i * src1_stride + j], &m, &v_round_offset, + &v_maxval, round_bits); + } + } } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c index 9dabe5b79..59506bdfe 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -21,7 +21,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling @@ -30,7 +30,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -55,7 +55,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); (void)w; @@ -82,7 +82,7 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -112,11 +112,11 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); + const uint8_t *mask, int w, int h); // Dimension: width_index static const blend_fn blend[9] = { @@ -139,11 +139,10 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, - w); + blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, + h); } -#if CONFIG_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -174,7 +173,7 @@ static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, blend_4_b10); @@ -185,7 +184,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { (void)w; blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, h, blend_4_b12); @@ -194,7 +193,7 @@ static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, static INLINE void blend_a64_vmask_bn_w8n_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w, blend_unit_fn blend) { + const uint8_t *mask, int w, int h, blend_unit_fn blend) { const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -218,9 +217,9 @@ static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, blend_8_b10); + src1_stride, mask, w, h, blend_8_b10); } static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, @@ -228,9 +227,9 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w) { + const uint8_t *mask, int w, int h) { blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w, blend_8_b12); + src1_stride, mask, w, h, blend_8_b12); } ////////////////////////////////////////////////////////////////////////////// @@ -240,11 +239,11 @@ static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, void aom_highbd_blend_a64_vmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, - const uint8_t *mask, int h, int w, int bd) { + const uint8_t *mask, int w, int h, int bd) { typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, - const uint8_t *mask, int h, int w); + const uint8_t *mask, int w, int h); // Dimensions are: bd_index X width_index static const blend_fn blend[2][2] = { @@ -272,14 +271,13 @@ void aom_highbd_blend_a64_vmask_sse4_1( if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, h, w, bd); + src1_stride, mask, w, h, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, - src1_stride, mask, h, w); + src1_stride, mask, w, h); } } -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h index daa2b2b3a..4880438bc 100644 --- a/third_party/aom/aom_dsp/x86/blend_sse4.h +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -53,7 +53,6 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, return v_res_w; } -#if CONFIG_HIGHBITDEPTH typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w); @@ -141,6 +140,5 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -#endif // CONFIG_HIGHBITDEPTH #endif // AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h index 5f9596a74..3f46420dd 100644 --- a/third_party/aom/aom_dsp/x86/common_avx2.h +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -14,7 +14,7 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" // Note: in and out could have the same value static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h index 8641164db..36fb1963a 100644 --- a/third_party/aom/aom_dsp/x86/convolve.h +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -13,7 +13,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" #include "aom_ports/mem.h" #include "aom_dsp/aom_convolve.h" @@ -84,102 +85,6 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } -#define FUN_CONV_2D(avg, opt) \ - void aom_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ - assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \ - filter_y[1] || filter_y[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ - aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \ - MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h + 7); \ - aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ - dst, dst_stride, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ - aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h + 1); \ - aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } \ - } - -#if CONFIG_LOOP_RESTORATION -// convolve_add_src is only used by the Wiener filter, which will never -// end up calling the bilinear functions (it uses a symmetric filter, so -// the possible numbers of taps are 1,3,5,7) -#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \ - opt) \ - void aom_convolve8_##name##_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - (void)filter_x; \ - (void)x_step_q4; \ - (void)filter_y; \ - (void)y_step_q4; \ - assert((-128 <= filter[3]) && (filter[3] <= 127)); \ - assert(step_q4 == 16); \ - while (w >= 16) { \ - aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - if (w) { \ - aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ - } - -#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \ - void aom_convolve8_##type##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ - assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \ - assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - aom_convolve8_##htype##horiz_##opt( \ - src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 7); \ - aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \ - dst, dst_stride, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h); \ - } -#endif - -#if CONFIG_HIGHBITDEPTH typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, const ptrdiff_t src_pitch, uint16_t *output_ptr, @@ -248,41 +153,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ - void aom_highbd_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= MAX_SB_SIZE); \ - assert(h <= MAX_SB_SIZE); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \ - filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, \ - fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \ - aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - CONVERT_TO_BYTEPTR(fdata2), \ - MAX_SB_SIZE, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h + 7, bd); \ - aom_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \ - dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } else { \ - DECLARE_ALIGNED(16, uint16_t, \ - fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \ - aom_highbd_convolve8_horiz_##opt( \ - src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \ - aom_highbd_convolve8_##avg##vert_##opt( \ - CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ - } \ - } else { \ - aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ - } \ - } -#endif // CONFIG_HIGHBITDEPTH - #endif // AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h new file mode 100644 index 000000000..7790baf2e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_DSP_X86_CONVOLVE_AVX2_H_ + +// filters for 16 +DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +static INLINE void prepare_coeffs_lowbd( + const InterpFilterParams *const filter_params, const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *const filter = av1_get_interp_filter_subpel_kernel( + *filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); + + // right shift all filter co-efficients by 1 to reduce the bits required. + // This extra right shift will be taken care of at the end while rounding + // the result. + // Since all filter co-efficients are even, this change will not affect the + // end result + assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), + _mm_set1_epi16(0xffff))); + + const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u)); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u)); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u)); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m256i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + *filter_params, subpel_q4 & SUBPEL_MASK); + + const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m256i convolve_lowbd(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]); + const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]); + const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), + _mm256_add_epi16(res_23, res_67)); + + return res; +} + +static INLINE __m256i convolve(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]); + const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]); + const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]); + + const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), + _mm256_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m256i convolve_lowbd_x(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[4]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + s[2] = _mm256_shuffle_epi8(data, filt[2]); + s[3] = _mm256_shuffle_epi8(data, filt[3]); + + return convolve_lowbd(s, coeffs); +} + +static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, + const __m256i *const res, + const int do_average) { + __m256i d; + if (do_average) { + d = _mm256_load_si256((__m256i *)dst); + d = _mm256_add_epi32(d, *res); + d = _mm256_srai_epi32(d, 1); + } else { + d = *res; + } + _mm256_store_si256((__m256i *)dst, d); +} + +static INLINE __m256i comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt, + const int use_jnt_comp_avg) { + __m256i res; + if (use_jnt_comp_avg) { + const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt); + const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt); + + const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm256_packs_epi32(res_lo, res_hi); + } else { + const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned, + const __m256i *const offset_const, + const __m256i *const round_const, + const int round_shift) { + const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi16( + _mm256_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, + const __m256i *const res_unsigned, + const __m256i *const wt0, + const __m256i *const wt1, + const int use_jnt_comp_avg) { + __m256i res; + if (use_jnt_comp_avg) { + const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); + const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); + const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); + res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned); + res = _mm256_srai_epi32(wt_res, 1); + } + return res; +} + +static INLINE __m256i highbd_convolve_rounding( + const __m256i *const res_unsigned, const __m256i *const offset_const, + const __m256i *const round_const, const int round_shift) { + const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const); + const __m256i res_round = _mm256_srai_epi32( + _mm256_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h new file mode 100644 index 000000000..e80c5872f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(d, *res); + d = _mm_srai_epi32(d, 1); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h new file mode 100644 index 000000000..846fe7bb4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_DSP_X86_CONVOLVE_SSE2_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, + const int subpel_q4, + __m128i *const coeffs /* [4] */) { + const int16_t *filter = av1_get_interp_filter_subpel_kernel( + *filter_params, subpel_q4 & SUBPEL_MASK); + const __m128i coeff = _mm_loadu_si128((__m128i *)filter); + + // coeffs 0 1 0 1 0 1 0 1 + coeffs[0] = _mm_shuffle_epi32(coeff, 0x00); + // coeffs 2 3 2 3 2 3 2 3 + coeffs[1] = _mm_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa); + // coeffs 6 7 6 7 6 7 6 7 + coeffs[3] = _mm_shuffle_epi32(coeff, 0xff); +} + +static INLINE __m128i convolve(const __m128i *const s, + const __m128i *const coeffs) { + const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]); + const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]); + const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]); + const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]); + + const __m128i res = + _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3)); + + return res; +} + +static INLINE __m128i convolve_lo_x(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_lo_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i convolve_hi_y(const __m128i *const s, + const __m128i *const coeffs) { + __m128i ss[4]; + ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128()); + ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128()); + ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128()); + ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128()); + return convolve(ss, coeffs); +} + +static INLINE __m128i comp_avg(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt, + const int use_jnt_comp_avg) { + __m128i res; + if (use_jnt_comp_avg) { + const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); + const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); + + const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt); + const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt); + + const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS); + const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS); + + res = _mm_packs_epi32(res_lo, res_hi); + } else { + const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned); + res = _mm_srai_epi16(wt_res, 1); + } + return res; +} + +static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned, + const __m128i *const offset_const, + const __m128i *const round_const, + const int round_shift) { + const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift); + return res_round; +} + +static INLINE __m128i highbd_convolve_rounding_sse2( + const __m128i *const res_unsigned, const __m128i *const offset_const, + const __m128i *const round_const, const int round_shift) { + const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const); + const __m128i res_round = + _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift); + + return res_round; +} + +#endif diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h new file mode 100644 index 000000000..d48c25667 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ +#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ + +// Note: +// This header file should be put below any x86 intrinsics head file + +static INLINE void mult_add_store(CONV_BUF_TYPE *const dst, + const __m128i *const res, + const __m128i *const wt0, + const __m128i *const wt1, + const int do_average) { + __m128i d; + if (do_average) { + d = _mm_load_si128((__m128i *)dst); + d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1)); + d = _mm_srai_epi32(d, DIST_PRECISION_BITS); + } else { + d = *res; + } + _mm_store_si128((__m128i *)dst, d); +} + +static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, + const __m128i *const res_unsigned, + const __m128i *const wt0, + const __m128i *const wt1, + const int use_jnt_comp_avg) { + __m128i res; + if (use_jnt_comp_avg) { + const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); + const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); + + const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res); + res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS); + } else { + const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned); + res = _mm_srai_epi32(wt_res, 1); + } + return res; +} + +#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c new file mode 100644 index 000000000..54da02253 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fft_avx2.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +extern void aom_transpose_float_sse2(const float *A, float *B, int n); +extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output, + int n); + +// Generate the 1d forward transforms for float using _mm256 +GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8); +} + +// Generate the 1d inverse transforms for float using _mm256 +GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); +GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps, + _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps, + _mm256_mul_ps); + +void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2, + aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8); +} + +void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_avx2, aom_ifft1d_16_avx2, + aom_transpose_float_sse2, 8); +} + +void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_avx2, aom_ifft1d_32_avx2, + aom_transpose_float_sse2, 8); +} diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c new file mode 100644 index 000000000..12bdc3e18 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/fft_sse2.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the +s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/fft_common.h" + +static INLINE void transpose4x4(const float *A, float *B, const int lda, + const int ldb) { + __m128 row1 = _mm_load_ps(&A[0 * lda]); + __m128 row2 = _mm_load_ps(&A[1 * lda]); + __m128 row3 = _mm_load_ps(&A[2 * lda]); + __m128 row4 = _mm_load_ps(&A[3 * lda]); + _MM_TRANSPOSE4_PS(row1, row2, row3, row4); + _mm_store_ps(&B[0 * ldb], row1); + _mm_store_ps(&B[1 * ldb], row2); + _mm_store_ps(&B[2 * ldb], row3); + _mm_store_ps(&B[3 * ldb], row4); +} + +void aom_transpose_float_sse2(const float *A, float *B, int n) { + for (int y = 0; y < n; y += 4) { + for (int x = 0; x < n; x += 4) { + transpose4x4(A + y * n + x, B + x * n + y, n, n); + } + } +} + +void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) { + const int n2 = n / 2; + output[0] = packed[0]; + output[1] = 0; + output[2 * (n2 * n)] = packed[n2 * n]; + output[2 * (n2 * n) + 1] = 0; + + output[2 * n2] = packed[n2]; + output[2 * n2 + 1] = 0; + output[2 * (n2 * n + n2)] = packed[n2 * n + n2]; + output[2 * (n2 * n + n2) + 1] = 0; + + for (int c = 1; c < n2; ++c) { + output[2 * (0 * n + c)] = packed[c]; + output[2 * (0 * n + c) + 1] = packed[c + n2]; + output[2 * (n2 * n + c) + 0] = packed[n2 * n + c]; + output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2]; + } + for (int r = 1; r < n2; ++r) { + output[2 * (r * n + 0)] = packed[r * n]; + output[2 * (r * n + 0) + 1] = packed[(r + n2) * n]; + output[2 * (r * n + n2) + 0] = packed[r * n + n2]; + output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2]; + + for (int c = 1; c < AOMMIN(n2, 4); ++c) { + output[2 * (r * n + c)] = + packed[r * n + c] - packed[(r + n2) * n + c + n2]; + output[2 * (r * n + c) + 1] = + packed[(r + n2) * n + c] + packed[r * n + c + n2]; + } + + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r * n + c); + __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r * n + c + n2); + real1 = _mm_sub_ps(real1, real2); + imag1 = _mm_add_ps(imag1, imag2); + _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1)); + } + + int r2 = r + n2; + int r3 = n - r2; + output[2 * (r2 * n + 0)] = packed[r3 * n]; + output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n]; + output[2 * (r2 * n + n2)] = packed[r3 * n + n2]; + output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2]; + for (int c = 1; c < AOMMIN(4, n2); ++c) { + output[2 * (r2 * n + c)] = + packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2]; + output[2 * (r2 * n + c) + 1] = + -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2]; + } + for (int c = 4; c < n2; c += 4) { + __m128 real1 = _mm_load_ps(packed + r3 * n + c); + __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2); + __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c); + __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2); + real1 = _mm_add_ps(real1, real2); + imag1 = _mm_sub_ps(imag2, imag1); + _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1)); + _mm_store_ps(output + 2 * (r2 * n + c + 2), + _mm_unpackhi_ps(real1, imag1)); + } + } +} + +// Generate definitions for 1d transforms using float and __mm128 +GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2, + aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4); +} + +// Generate definitions for 1d inverse transforms using float and mm128 +GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps); +GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); +GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps, + _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps); + +void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2, + aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2, + aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4); +} + +void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, + aom_fft1d_16_sse2, aom_ifft1d_16_sse2, + aom_transpose_float_sse2, 4); +} + +void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) { + aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, + aom_fft1d_32_sse2, aom_ifft1d_32_sse2, + aom_transpose_float_sse2, 4); +} diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c deleted file mode 100644 index b8ec08de7..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_dct32_8cols_sse2.c +++ /dev/null @@ -1,862 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include // SSE2 - -#include "aom_dsp/fwd_txfm.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -// Apply a 32-element IDCT to 8 columns. This does not do any transposition -// of its output - the caller is expected to do that. -// The input buffers are the top and bottom halves of an 8x32 block. -void fdct32_8col(__m128i *in0, __m128i *in1) { - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); - const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i step1[32]; - __m128i step2[32]; - __m128i step3[32]; - __m128i out[32]; - // Stage 1 - { - const __m128i *ina = in0; - const __m128i *inb = in1 + 15; - __m128i *step1a = &step1[0]; - __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - { - const __m128i *ina = in0 + 4; - const __m128i *inb = in1 + 11; - __m128i *step1a = &step1[4]; - __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - { - const __m128i *ina = in0 + 8; - const __m128i *inb = in1 + 7; - __m128i *step1a = &step1[8]; - __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - { - const __m128i *ina = in0 + 12; - const __m128i *inb = in1 + 3; - __m128i *step1a = &step1[12]; - __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - } - // Stage 2 - { - step2[0] = _mm_add_epi16(step1[0], step1[15]); - step2[1] = _mm_add_epi16(step1[1], step1[14]); - step2[2] = _mm_add_epi16(step1[2], step1[13]); - step2[3] = _mm_add_epi16(step1[3], step1[12]); - step2[4] = _mm_add_epi16(step1[4], step1[11]); - step2[5] = _mm_add_epi16(step1[5], step1[10]); - step2[6] = _mm_add_epi16(step1[6], step1[9]); - step2[7] = _mm_add_epi16(step1[7], step1[8]); - step2[8] = _mm_sub_epi16(step1[7], step1[8]); - step2[9] = _mm_sub_epi16(step1[6], step1[9]); - step2[10] = _mm_sub_epi16(step1[5], step1[10]); - step2[11] = _mm_sub_epi16(step1[4], step1[11]); - step2[12] = _mm_sub_epi16(step1[3], step1[12]); - step2[13] = _mm_sub_epi16(step1[2], step1[13]); - step2[14] = _mm_sub_epi16(step1[1], step1[14]); - step2[15] = _mm_sub_epi16(step1[0], step1[15]); - } - { - const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); - const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); - const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); - const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); - const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); - const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); - const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); - const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); - const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); - } - // Stage 3 - { - step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm_add_epi16(step2[23], step1[16]); - step3[17] = _mm_add_epi16(step2[22], step1[17]); - step3[18] = _mm_add_epi16(step2[21], step1[18]); - step3[19] = _mm_add_epi16(step2[20], step1[19]); - step3[20] = _mm_sub_epi16(step1[19], step2[20]); - step3[21] = _mm_sub_epi16(step1[18], step2[21]); - step3[22] = _mm_sub_epi16(step1[17], step2[22]); - step3[23] = _mm_sub_epi16(step1[16], step2[23]); - step3[24] = _mm_sub_epi16(step1[31], step2[24]); - step3[25] = _mm_sub_epi16(step1[30], step2[25]); - step3[26] = _mm_sub_epi16(step1[29], step2[26]); - step3[27] = _mm_sub_epi16(step1[28], step2[27]); - step3[28] = _mm_add_epi16(step2[27], step1[28]); - step3[29] = _mm_add_epi16(step2[26], step1[29]); - step3[30] = _mm_add_epi16(step2[25], step1[30]); - step3[31] = _mm_add_epi16(step2[24], step1[31]); - } - - // Stage 4 - { - step1[0] = _mm_add_epi16(step3[3], step3[0]); - step1[1] = _mm_add_epi16(step3[2], step3[1]); - step1[2] = _mm_sub_epi16(step3[1], step3[2]); - step1[3] = _mm_sub_epi16(step3[0], step3[3]); - step1[8] = _mm_add_epi16(step3[11], step2[8]); - step1[9] = _mm_add_epi16(step3[10], step2[9]); - step1[10] = _mm_sub_epi16(step2[9], step3[10]); - step1[11] = _mm_sub_epi16(step2[8], step3[11]); - step1[12] = _mm_sub_epi16(step2[15], step3[12]); - step1[13] = _mm_sub_epi16(step2[14], step3[13]); - step1[14] = _mm_add_epi16(step3[13], step2[14]); - step1[15] = _mm_add_epi16(step3[12], step2[15]); - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm_add_epi16(step1[5], step3[4]); - step2[5] = _mm_sub_epi16(step3[4], step1[5]); - step2[6] = _mm_sub_epi16(step3[7], step1[6]); - step2[7] = _mm_add_epi16(step1[6], step3[7]); - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm_add_epi16(step1[19], step3[16]); - step2[17] = _mm_add_epi16(step1[18], step3[17]); - step2[18] = _mm_sub_epi16(step3[17], step1[18]); - step2[19] = _mm_sub_epi16(step3[16], step1[19]); - step2[20] = _mm_sub_epi16(step3[23], step1[20]); - step2[21] = _mm_sub_epi16(step3[22], step1[21]); - step2[22] = _mm_add_epi16(step1[21], step3[22]); - step2[23] = _mm_add_epi16(step1[20], step3[23]); - step2[24] = _mm_add_epi16(step1[27], step3[24]); - step2[25] = _mm_add_epi16(step1[26], step3[25]); - step2[26] = _mm_sub_epi16(step3[25], step1[26]); - step2[27] = _mm_sub_epi16(step3[24], step1[27]); - step2[28] = _mm_sub_epi16(step3[31], step1[28]); - step2[29] = _mm_sub_epi16(step3[30], step1[29]); - step2[30] = _mm_add_epi16(step1[29], step3[30]); - step2[31] = _mm_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); - } - { - step3[8] = _mm_add_epi16(step2[9], step1[8]); - step3[9] = _mm_sub_epi16(step1[8], step2[9]); - step3[10] = _mm_sub_epi16(step1[11], step2[10]); - step3[11] = _mm_add_epi16(step2[10], step1[11]); - step3[12] = _mm_add_epi16(step2[13], step1[12]); - step3[13] = _mm_sub_epi16(step1[12], step2[13]); - step3[14] = _mm_sub_epi16(step1[15], step2[14]); - step3[15] = _mm_add_epi16(step2[14], step1[15]); - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm_add_epi16(step3[17], step2[16]); - step1[17] = _mm_sub_epi16(step2[16], step3[17]); - step1[18] = _mm_sub_epi16(step2[19], step3[18]); - step1[19] = _mm_add_epi16(step3[18], step2[19]); - step1[20] = _mm_add_epi16(step3[21], step2[20]); - step1[21] = _mm_sub_epi16(step2[20], step3[21]); - step1[22] = _mm_sub_epi16(step2[23], step3[22]); - step1[23] = _mm_add_epi16(step3[22], step2[23]); - step1[24] = _mm_add_epi16(step3[25], step2[24]); - step1[25] = _mm_sub_epi16(step2[24], step3[25]); - step1[26] = _mm_sub_epi16(step2[27], step3[26]); - step1[27] = _mm_add_epi16(step3[26], step2[27]); - step1[28] = _mm_add_epi16(step3[29], step2[28]); - step1[29] = _mm_sub_epi16(step2[28], step3[29]); - step1[30] = _mm_sub_epi16(step2[31], step3[30]); - step1[31] = _mm_add_epi16(step3[30], step2[31]); - } - // Final stage --- outputs indices are bit-reversed. - { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); - } - - // Output results - { - int j; - for (j = 0; j < 16; ++j) { - _mm_storeu_si128((__m128i *)(in0 + j), out[j]); - _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]); - } - } -} // NOLINT diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h deleted file mode 100644 index 216739581..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_avx2.h +++ /dev/null @@ -1,3022 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include // AVX2 - -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_intrin.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -#if FDCT32x32_HIGH_PRECISION -static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) { - __m256i buf0, buf1; - buf0 = _mm256_mul_epu32(a, b); - a = _mm256_srli_epi64(a, 32); - b = _mm256_srli_epi64(b, 32); - buf1 = _mm256_mul_epu32(a, b); - return _mm256_add_epi64(buf0, buf1); -} - -static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) { - __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); - __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); - return _mm256_unpacklo_epi64(buf0, buf1); -} -#endif - -#ifndef STORE_COEFF_FUNC -#define STORE_COEFF_FUNC -static void store_coeff(const __m256i *coeff, tran_low_t *curr, - tran_low_t *next) { - __m128i u = _mm256_castsi256_si128(*coeff); - storeu_output(&u, curr); - u = _mm256_extractf128_si256(*coeff, 1); - storeu_output(&u, next); -} -#endif - -void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org, - int stride) { - // Calculate pre-multiplied strides - const int str1 = stride; - const int str2 = 2 * stride; - const int str3 = 2 * stride + str1; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]); - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i k__cospi_p16_m16 = - pair256_set_epi16(+cospi_16_64, -cospi_16_64); - const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64); - const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64); - const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64); - const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i k__cospi_m12_m20 = - pair256_set_epi16(-cospi_12_64, -cospi_20_64); - const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64); - const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64); - const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64); - const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64); - const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64); - const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64); - const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64); - const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64); - const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64); - const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64); - const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64); - const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64); - const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64); - const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64); - const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64); - const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64); - const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64); - const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64); - const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64); - const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64); - const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64); - const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64); - const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64); - const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64); - const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); - const __m256i kZero = _mm256_set1_epi16(0); - const __m256i kOne = _mm256_set1_epi16(1); - // Do the two transform/transpose passes - int pass; - for (pass = 0; pass < 2; ++pass) { - // We process sixteen columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 32; column_start += 16) { - __m256i step1[32]; - __m256i step2[32]; - __m256i step3[32]; - __m256i out[32]; - // Stage 1 - // Note: even though all the loads below are aligned, using the aligned - // intrinsic make the code slightly slower. - if (0 == pass) { - const int16_t *in = &input[column_start]; - // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. - { - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m256i *step1a = &step1[0]; - __m256i *step1b = &step1[31]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m256i *step1a = &step1[4]; - __m256i *step1b = &step1[27]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m256i *step1a = &step1[8]; - __m256i *step1b = &step1[23]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; - __m256i *step1a = &step1[12]; - __m256i *step1b = &step1[19]; - const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina)); - const __m256i ina1 = - _mm256_loadu_si256((const __m256i *)(ina + str1)); - const __m256i ina2 = - _mm256_loadu_si256((const __m256i *)(ina + str2)); - const __m256i ina3 = - _mm256_loadu_si256((const __m256i *)(ina + str3)); - const __m256i inb3 = - _mm256_loadu_si256((const __m256i *)(inb - str3)); - const __m256i inb2 = - _mm256_loadu_si256((const __m256i *)(inb - str2)); - const __m256i inb1 = - _mm256_loadu_si256((const __m256i *)(inb - str1)); - const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb)); - step1a[0] = _mm256_add_epi16(ina0, inb0); - step1a[1] = _mm256_add_epi16(ina1, inb1); - step1a[2] = _mm256_add_epi16(ina2, inb2); - step1a[3] = _mm256_add_epi16(ina3, inb3); - step1b[-3] = _mm256_sub_epi16(ina3, inb3); - step1b[-2] = _mm256_sub_epi16(ina2, inb2); - step1b[-1] = _mm256_sub_epi16(ina1, inb1); - step1b[-0] = _mm256_sub_epi16(ina0, inb0); - step1a[0] = _mm256_slli_epi16(step1a[0], 2); - step1a[1] = _mm256_slli_epi16(step1a[1], 2); - step1a[2] = _mm256_slli_epi16(step1a[2], 2); - step1a[3] = _mm256_slli_epi16(step1a[3], 2); - step1b[-3] = _mm256_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm256_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm256_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm256_slli_epi16(step1b[-0], 2); - } - } else { - int16_t *in = &intermediate[column_start]; - // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; - // Note: using the same approach as above to have common offset is - // counter-productive as all offsets can be calculated at compile - // time. - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. - { - __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32)); - __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32)); - __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32)); - __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32)); - __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32)); - __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32)); - __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32)); - __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32)); - step1[0] = _mm256_add_epi16(in00, in31); - step1[1] = _mm256_add_epi16(in01, in30); - step1[2] = _mm256_add_epi16(in02, in29); - step1[3] = _mm256_add_epi16(in03, in28); - step1[28] = _mm256_sub_epi16(in03, in28); - step1[29] = _mm256_sub_epi16(in02, in29); - step1[30] = _mm256_sub_epi16(in01, in30); - step1[31] = _mm256_sub_epi16(in00, in31); - } - { - __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32)); - __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32)); - __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32)); - __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32)); - __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32)); - __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32)); - __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32)); - __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32)); - step1[4] = _mm256_add_epi16(in04, in27); - step1[5] = _mm256_add_epi16(in05, in26); - step1[6] = _mm256_add_epi16(in06, in25); - step1[7] = _mm256_add_epi16(in07, in24); - step1[24] = _mm256_sub_epi16(in07, in24); - step1[25] = _mm256_sub_epi16(in06, in25); - step1[26] = _mm256_sub_epi16(in05, in26); - step1[27] = _mm256_sub_epi16(in04, in27); - } - { - __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32)); - __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32)); - __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32)); - __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32)); - __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32)); - __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32)); - __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32)); - __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32)); - step1[8] = _mm256_add_epi16(in08, in23); - step1[9] = _mm256_add_epi16(in09, in22); - step1[10] = _mm256_add_epi16(in10, in21); - step1[11] = _mm256_add_epi16(in11, in20); - step1[20] = _mm256_sub_epi16(in11, in20); - step1[21] = _mm256_sub_epi16(in10, in21); - step1[22] = _mm256_sub_epi16(in09, in22); - step1[23] = _mm256_sub_epi16(in08, in23); - } - { - __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32)); - __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32)); - __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32)); - __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32)); - __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32)); - __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32)); - __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32)); - __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32)); - step1[12] = _mm256_add_epi16(in12, in19); - step1[13] = _mm256_add_epi16(in13, in18); - step1[14] = _mm256_add_epi16(in14, in17); - step1[15] = _mm256_add_epi16(in15, in16); - step1[16] = _mm256_sub_epi16(in15, in16); - step1[17] = _mm256_sub_epi16(in14, in17); - step1[18] = _mm256_sub_epi16(in13, in18); - step1[19] = _mm256_sub_epi16(in12, in19); - } - } - // Stage 2 - { - step2[0] = _mm256_add_epi16(step1[0], step1[15]); - step2[1] = _mm256_add_epi16(step1[1], step1[14]); - step2[2] = _mm256_add_epi16(step1[2], step1[13]); - step2[3] = _mm256_add_epi16(step1[3], step1[12]); - step2[4] = _mm256_add_epi16(step1[4], step1[11]); - step2[5] = _mm256_add_epi16(step1[5], step1[10]); - step2[6] = _mm256_add_epi16(step1[6], step1[9]); - step2[7] = _mm256_add_epi16(step1[7], step1[8]); - step2[8] = _mm256_sub_epi16(step1[7], step1[8]); - step2[9] = _mm256_sub_epi16(step1[6], step1[9]); - step2[10] = _mm256_sub_epi16(step1[5], step1[10]); - step2[11] = _mm256_sub_epi16(step1[4], step1[11]); - step2[12] = _mm256_sub_epi16(step1[3], step1[12]); - step2[13] = _mm256_sub_epi16(step1[2], step1[13]); - step2[14] = _mm256_sub_epi16(step1[1], step1[14]); - step2[15] = _mm256_sub_epi16(step1[0], step1[15]); - } - { - const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]); - const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]); - const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]); - const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]); - const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]); - const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]); - const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]); - const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]); - const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s2_20_4 = - _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m256i s2_20_5 = - _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m256i s2_21_4 = - _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m256i s2_21_5 = - _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m256i s2_22_4 = - _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m256i s2_22_5 = - _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m256i s2_23_4 = - _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m256i s2_23_5 = - _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m256i s2_24_4 = - _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m256i s2_24_5 = - _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m256i s2_25_4 = - _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m256i s2_25_5 = - _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m256i s2_26_4 = - _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m256i s2_26_5 = - _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m256i s2_27_4 = - _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m256i s2_27_5 = - _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7); - } - -#if !FDCT32x32_HIGH_PRECISION - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. - if (1 == pass) { - __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero, step2[0]); - __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero, step2[1]); - __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero, step2[2]); - __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero, step2[3]); - __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero, step2[4]); - __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero, step2[5]); - __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero, step2[6]); - __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero, step2[7]); - __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero, step2[8]); - __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero, step2[9]); - __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero, step2[10]); - __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero, step2[11]); - __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero, step2[12]); - __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero, step2[13]); - __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero, step2[14]); - __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero, step2[15]); - __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero, step1[16]); - __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero, step1[17]); - __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero, step1[18]); - __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero, step1[19]); - __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero, step2[20]); - __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero, step2[21]); - __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero, step2[22]); - __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero, step2[23]); - __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero, step2[24]); - __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero, step2[25]); - __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero, step2[26]); - __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero, step2[27]); - __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero, step1[28]); - __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero, step1[29]); - __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero, step1[30]); - __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero, step1[31]); - - step2[0] = _mm256_sub_epi16(step2[0], s3_00_0); - step2[1] = _mm256_sub_epi16(step2[1], s3_01_0); - step2[2] = _mm256_sub_epi16(step2[2], s3_02_0); - step2[3] = _mm256_sub_epi16(step2[3], s3_03_0); - step2[4] = _mm256_sub_epi16(step2[4], s3_04_0); - step2[5] = _mm256_sub_epi16(step2[5], s3_05_0); - step2[6] = _mm256_sub_epi16(step2[6], s3_06_0); - step2[7] = _mm256_sub_epi16(step2[7], s3_07_0); - step2[8] = _mm256_sub_epi16(step2[8], s2_08_0); - step2[9] = _mm256_sub_epi16(step2[9], s2_09_0); - step2[10] = _mm256_sub_epi16(step2[10], s3_10_0); - step2[11] = _mm256_sub_epi16(step2[11], s3_11_0); - step2[12] = _mm256_sub_epi16(step2[12], s3_12_0); - step2[13] = _mm256_sub_epi16(step2[13], s3_13_0); - step2[14] = _mm256_sub_epi16(step2[14], s2_14_0); - step2[15] = _mm256_sub_epi16(step2[15], s2_15_0); - step1[16] = _mm256_sub_epi16(step1[16], s3_16_0); - step1[17] = _mm256_sub_epi16(step1[17], s3_17_0); - step1[18] = _mm256_sub_epi16(step1[18], s3_18_0); - step1[19] = _mm256_sub_epi16(step1[19], s3_19_0); - step2[20] = _mm256_sub_epi16(step2[20], s3_20_0); - step2[21] = _mm256_sub_epi16(step2[21], s3_21_0); - step2[22] = _mm256_sub_epi16(step2[22], s3_22_0); - step2[23] = _mm256_sub_epi16(step2[23], s3_23_0); - step2[24] = _mm256_sub_epi16(step2[24], s3_24_0); - step2[25] = _mm256_sub_epi16(step2[25], s3_25_0); - step2[26] = _mm256_sub_epi16(step2[26], s3_26_0); - step2[27] = _mm256_sub_epi16(step2[27], s3_27_0); - step1[28] = _mm256_sub_epi16(step1[28], s3_28_0); - step1[29] = _mm256_sub_epi16(step1[29], s3_29_0); - step1[30] = _mm256_sub_epi16(step1[30], s3_30_0); - step1[31] = _mm256_sub_epi16(step1[31], s3_31_0); - - step2[0] = _mm256_add_epi16(step2[0], kOne); - step2[1] = _mm256_add_epi16(step2[1], kOne); - step2[2] = _mm256_add_epi16(step2[2], kOne); - step2[3] = _mm256_add_epi16(step2[3], kOne); - step2[4] = _mm256_add_epi16(step2[4], kOne); - step2[5] = _mm256_add_epi16(step2[5], kOne); - step2[6] = _mm256_add_epi16(step2[6], kOne); - step2[7] = _mm256_add_epi16(step2[7], kOne); - step2[8] = _mm256_add_epi16(step2[8], kOne); - step2[9] = _mm256_add_epi16(step2[9], kOne); - step2[10] = _mm256_add_epi16(step2[10], kOne); - step2[11] = _mm256_add_epi16(step2[11], kOne); - step2[12] = _mm256_add_epi16(step2[12], kOne); - step2[13] = _mm256_add_epi16(step2[13], kOne); - step2[14] = _mm256_add_epi16(step2[14], kOne); - step2[15] = _mm256_add_epi16(step2[15], kOne); - step1[16] = _mm256_add_epi16(step1[16], kOne); - step1[17] = _mm256_add_epi16(step1[17], kOne); - step1[18] = _mm256_add_epi16(step1[18], kOne); - step1[19] = _mm256_add_epi16(step1[19], kOne); - step2[20] = _mm256_add_epi16(step2[20], kOne); - step2[21] = _mm256_add_epi16(step2[21], kOne); - step2[22] = _mm256_add_epi16(step2[22], kOne); - step2[23] = _mm256_add_epi16(step2[23], kOne); - step2[24] = _mm256_add_epi16(step2[24], kOne); - step2[25] = _mm256_add_epi16(step2[25], kOne); - step2[26] = _mm256_add_epi16(step2[26], kOne); - step2[27] = _mm256_add_epi16(step2[27], kOne); - step1[28] = _mm256_add_epi16(step1[28], kOne); - step1[29] = _mm256_add_epi16(step1[29], kOne); - step1[30] = _mm256_add_epi16(step1[30], kOne); - step1[31] = _mm256_add_epi16(step1[31], kOne); - - step2[0] = _mm256_srai_epi16(step2[0], 2); - step2[1] = _mm256_srai_epi16(step2[1], 2); - step2[2] = _mm256_srai_epi16(step2[2], 2); - step2[3] = _mm256_srai_epi16(step2[3], 2); - step2[4] = _mm256_srai_epi16(step2[4], 2); - step2[5] = _mm256_srai_epi16(step2[5], 2); - step2[6] = _mm256_srai_epi16(step2[6], 2); - step2[7] = _mm256_srai_epi16(step2[7], 2); - step2[8] = _mm256_srai_epi16(step2[8], 2); - step2[9] = _mm256_srai_epi16(step2[9], 2); - step2[10] = _mm256_srai_epi16(step2[10], 2); - step2[11] = _mm256_srai_epi16(step2[11], 2); - step2[12] = _mm256_srai_epi16(step2[12], 2); - step2[13] = _mm256_srai_epi16(step2[13], 2); - step2[14] = _mm256_srai_epi16(step2[14], 2); - step2[15] = _mm256_srai_epi16(step2[15], 2); - step1[16] = _mm256_srai_epi16(step1[16], 2); - step1[17] = _mm256_srai_epi16(step1[17], 2); - step1[18] = _mm256_srai_epi16(step1[18], 2); - step1[19] = _mm256_srai_epi16(step1[19], 2); - step2[20] = _mm256_srai_epi16(step2[20], 2); - step2[21] = _mm256_srai_epi16(step2[21], 2); - step2[22] = _mm256_srai_epi16(step2[22], 2); - step2[23] = _mm256_srai_epi16(step2[23], 2); - step2[24] = _mm256_srai_epi16(step2[24], 2); - step2[25] = _mm256_srai_epi16(step2[25], 2); - step2[26] = _mm256_srai_epi16(step2[26], 2); - step2[27] = _mm256_srai_epi16(step2[27], 2); - step1[28] = _mm256_srai_epi16(step1[28], 2); - step1[29] = _mm256_srai_epi16(step1[29], 2); - step1[30] = _mm256_srai_epi16(step1[30], 2); - step1[31] = _mm256_srai_epi16(step1[31], 2); - } -#endif - -#if FDCT32x32_HIGH_PRECISION - if (pass == 0) { -#endif - // Stage 3 - { - step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]); - step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]); - step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]); - step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]); - step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]); - step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]); - step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]); - step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]); - } - { - const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); - const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); - const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); - const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); - const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s3_10_4 = - _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = - _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = - _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = - _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = - _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = - _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = - _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = - _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7); - } - { - step3[16] = _mm256_add_epi16(step2[23], step1[16]); - step3[17] = _mm256_add_epi16(step2[22], step1[17]); - step3[18] = _mm256_add_epi16(step2[21], step1[18]); - step3[19] = _mm256_add_epi16(step2[20], step1[19]); - step3[20] = _mm256_sub_epi16(step1[19], step2[20]); - step3[21] = _mm256_sub_epi16(step1[18], step2[21]); - step3[22] = _mm256_sub_epi16(step1[17], step2[22]); - step3[23] = _mm256_sub_epi16(step1[16], step2[23]); - step3[24] = _mm256_sub_epi16(step1[31], step2[24]); - step3[25] = _mm256_sub_epi16(step1[30], step2[25]); - step3[26] = _mm256_sub_epi16(step1[29], step2[26]); - step3[27] = _mm256_sub_epi16(step1[28], step2[27]); - step3[28] = _mm256_add_epi16(step2[27], step1[28]); - step3[29] = _mm256_add_epi16(step2[26], step1[29]); - step3[30] = _mm256_add_epi16(step2[25], step1[30]); - step3[31] = _mm256_add_epi16(step2[24], step1[31]); - } - - // Stage 4 - { - step1[0] = _mm256_add_epi16(step3[3], step3[0]); - step1[1] = _mm256_add_epi16(step3[2], step3[1]); - step1[2] = _mm256_sub_epi16(step3[1], step3[2]); - step1[3] = _mm256_sub_epi16(step3[0], step3[3]); - step1[8] = _mm256_add_epi16(step3[11], step2[8]); - step1[9] = _mm256_add_epi16(step3[10], step2[9]); - step1[10] = _mm256_sub_epi16(step2[9], step3[10]); - step1[11] = _mm256_sub_epi16(step2[8], step3[11]); - step1[12] = _mm256_sub_epi16(step2[15], step3[12]); - step1[13] = _mm256_sub_epi16(step2[14], step3[13]); - step1[14] = _mm256_add_epi16(step3[13], step2[14]); - step1[15] = _mm256_add_epi16(step3[12], step2[15]); - } - { - const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]); - const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]); - const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s1_05_4 = - _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m256i s1_05_5 = - _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m256i s1_06_4 = - _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m256i s1_06_5 = - _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7); - } - { - const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]); - const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]); - const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]); - const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]); - const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]); - const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]); - const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]); - const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]); - const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s1_18_4 = - _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m256i s1_18_5 = - _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m256i s1_19_4 = - _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m256i s1_19_5 = - _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m256i s1_20_4 = - _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m256i s1_20_5 = - _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m256i s1_21_4 = - _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m256i s1_21_5 = - _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m256i s1_26_4 = - _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m256i s1_26_5 = - _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m256i s1_27_4 = - _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m256i s1_27_5 = - _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m256i s1_28_4 = - _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m256i s1_28_5 = - _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m256i s1_29_4 = - _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m256i s1_29_5 = - _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7); - } - // Stage 5 - { - step2[4] = _mm256_add_epi16(step1[5], step3[4]); - step2[5] = _mm256_sub_epi16(step3[4], step1[5]); - step2[6] = _mm256_sub_epi16(step3[7], step1[6]); - step2[7] = _mm256_add_epi16(step1[6], step3[7]); - } - { - const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]); - const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]); - const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]); - const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]); - const __m256i out_00_2 = - _mm256_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m256i out_00_3 = - _mm256_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m256i out_16_2 = - _mm256_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m256i out_16_3 = - _mm256_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m256i out_08_2 = - _mm256_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m256i out_08_3 = - _mm256_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m256i out_24_2 = - _mm256_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m256i out_24_3 = - _mm256_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m256i out_00_4 = - _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m256i out_00_5 = - _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m256i out_16_4 = - _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m256i out_16_5 = - _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m256i out_08_4 = - _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m256i out_08_5 = - _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m256i out_24_4 = - _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m256i out_24_5 = - _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[0] = _mm256_packs_epi32(out_00_6, out_00_7); - out[16] = _mm256_packs_epi32(out_16_6, out_16_7); - out[8] = _mm256_packs_epi32(out_08_6, out_08_7); - out[24] = _mm256_packs_epi32(out_24_6, out_24_7); - } - { - const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[9], step1[14]); - const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[9], step1[14]); - const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]); - const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]); - const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m256i s2_09_4 = - _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m256i s2_09_5 = - _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m256i s2_10_4 = - _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m256i s2_10_5 = - _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m256i s2_13_4 = - _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m256i s2_13_5 = - _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m256i s2_14_4 = - _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m256i s2_14_5 = - _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[9] = _mm256_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7); - } - { - step2[16] = _mm256_add_epi16(step1[19], step3[16]); - step2[17] = _mm256_add_epi16(step1[18], step3[17]); - step2[18] = _mm256_sub_epi16(step3[17], step1[18]); - step2[19] = _mm256_sub_epi16(step3[16], step1[19]); - step2[20] = _mm256_sub_epi16(step3[23], step1[20]); - step2[21] = _mm256_sub_epi16(step3[22], step1[21]); - step2[22] = _mm256_add_epi16(step1[21], step3[22]); - step2[23] = _mm256_add_epi16(step1[20], step3[23]); - step2[24] = _mm256_add_epi16(step1[27], step3[24]); - step2[25] = _mm256_add_epi16(step1[26], step3[25]); - step2[26] = _mm256_sub_epi16(step3[25], step1[26]); - step2[27] = _mm256_sub_epi16(step3[24], step1[27]); - step2[28] = _mm256_sub_epi16(step3[31], step1[28]); - step2[29] = _mm256_sub_epi16(step3[30], step1[29]); - step2[30] = _mm256_add_epi16(step1[29], step3[30]); - step2[31] = _mm256_add_epi16(step1[28], step3[31]); - } - // Stage 6 - { - const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]); - const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]); - const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]); - const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]); - const __m256i out_04_2 = - _mm256_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m256i out_04_3 = - _mm256_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m256i out_20_2 = - _mm256_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m256i out_20_3 = - _mm256_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m256i out_12_2 = - _mm256_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m256i out_12_3 = - _mm256_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m256i out_28_2 = - _mm256_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m256i out_28_3 = - _mm256_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m256i out_04_4 = - _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m256i out_04_5 = - _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m256i out_20_4 = - _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m256i out_20_5 = - _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m256i out_12_4 = - _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m256i out_12_5 = - _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m256i out_28_4 = - _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m256i out_28_5 = - _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm256_packs_epi32(out_04_6, out_04_7); - out[20] = _mm256_packs_epi32(out_20_6, out_20_7); - out[12] = _mm256_packs_epi32(out_12_6, out_12_7); - out[28] = _mm256_packs_epi32(out_28_6, out_28_7); - } - { - step3[8] = _mm256_add_epi16(step2[9], step1[8]); - step3[9] = _mm256_sub_epi16(step1[8], step2[9]); - step3[10] = _mm256_sub_epi16(step1[11], step2[10]); - step3[11] = _mm256_add_epi16(step2[10], step1[11]); - step3[12] = _mm256_add_epi16(step2[13], step1[12]); - step3[13] = _mm256_sub_epi16(step1[12], step2[13]); - step3[14] = _mm256_sub_epi16(step1[15], step2[14]); - step3[15] = _mm256_add_epi16(step2[14], step1[15]); - } - { - const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]); - const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]); - const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]); - const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]); - const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]); - const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]); - const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]); - const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]); - const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m256i s3_17_4 = - _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m256i s3_17_5 = - _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m256i s3_18_4 = - _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m256i s3_18_5 = - _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m256i s3_21_4 = - _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m256i s3_21_5 = - _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m256i s3_22_4 = - _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m256i s3_22_5 = - _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m256i s3_25_4 = - _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m256i s3_25_5 = - _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m256i s3_26_4 = - _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m256i s3_26_5 = - _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m256i s3_29_4 = - _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m256i s3_29_5 = - _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m256i s3_30_4 = - _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m256i s3_30_5 = - _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7); - } - // Stage 7 - { - const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[8], step3[15]); - const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[8], step3[15]); - const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[9], step3[14]); - const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[9], step3[14]); - const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]); - const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]); - const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]); - const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]); - const __m256i out_02_2 = - _mm256_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m256i out_02_3 = - _mm256_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m256i out_18_2 = - _mm256_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m256i out_18_3 = - _mm256_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m256i out_10_2 = - _mm256_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m256i out_10_3 = - _mm256_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m256i out_26_2 = - _mm256_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m256i out_26_3 = - _mm256_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m256i out_06_2 = - _mm256_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m256i out_06_3 = - _mm256_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m256i out_22_2 = - _mm256_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m256i out_22_3 = - _mm256_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m256i out_14_2 = - _mm256_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m256i out_14_3 = - _mm256_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m256i out_30_2 = - _mm256_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m256i out_30_3 = - _mm256_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m256i out_02_4 = - _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m256i out_02_5 = - _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m256i out_18_4 = - _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m256i out_18_5 = - _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m256i out_10_4 = - _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m256i out_10_5 = - _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m256i out_26_4 = - _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m256i out_26_5 = - _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m256i out_06_4 = - _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m256i out_06_5 = - _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m256i out_22_4 = - _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m256i out_22_5 = - _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m256i out_14_4 = - _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m256i out_14_5 = - _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m256i out_30_4 = - _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m256i out_30_5 = - _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[2] = _mm256_packs_epi32(out_02_6, out_02_7); - out[18] = _mm256_packs_epi32(out_18_6, out_18_7); - out[10] = _mm256_packs_epi32(out_10_6, out_10_7); - out[26] = _mm256_packs_epi32(out_26_6, out_26_7); - out[6] = _mm256_packs_epi32(out_06_6, out_06_7); - out[22] = _mm256_packs_epi32(out_22_6, out_22_7); - out[14] = _mm256_packs_epi32(out_14_6, out_14_7); - out[30] = _mm256_packs_epi32(out_30_6, out_30_7); - } - { - step1[16] = _mm256_add_epi16(step3[17], step2[16]); - step1[17] = _mm256_sub_epi16(step2[16], step3[17]); - step1[18] = _mm256_sub_epi16(step2[19], step3[18]); - step1[19] = _mm256_add_epi16(step3[18], step2[19]); - step1[20] = _mm256_add_epi16(step3[21], step2[20]); - step1[21] = _mm256_sub_epi16(step2[20], step3[21]); - step1[22] = _mm256_sub_epi16(step2[23], step3[22]); - step1[23] = _mm256_add_epi16(step3[22], step2[23]); - step1[24] = _mm256_add_epi16(step3[25], step2[24]); - step1[25] = _mm256_sub_epi16(step2[24], step3[25]); - step1[26] = _mm256_sub_epi16(step2[27], step3[26]); - step1[27] = _mm256_add_epi16(step3[26], step2[27]); - step1[28] = _mm256_add_epi16(step3[29], step2[28]); - step1[29] = _mm256_sub_epi16(step2[28], step3[29]); - step1[30] = _mm256_sub_epi16(step2[31], step3[30]); - step1[31] = _mm256_add_epi16(step3[30], step2[31]); - } - // Final stage --- outputs indices are bit-reversed. - { - const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]); - const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]); - const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]); - const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]); - const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]); - const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]); - const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]); - const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]); - const __m256i out_01_2 = - _mm256_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m256i out_01_3 = - _mm256_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m256i out_17_2 = - _mm256_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m256i out_17_3 = - _mm256_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m256i out_09_2 = - _mm256_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m256i out_09_3 = - _mm256_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m256i out_25_2 = - _mm256_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m256i out_25_3 = - _mm256_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m256i out_07_2 = - _mm256_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m256i out_07_3 = - _mm256_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m256i out_23_2 = - _mm256_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m256i out_23_3 = - _mm256_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m256i out_15_2 = - _mm256_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m256i out_15_3 = - _mm256_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m256i out_31_2 = - _mm256_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m256i out_31_3 = - _mm256_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m256i out_01_4 = - _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m256i out_01_5 = - _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m256i out_17_4 = - _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m256i out_17_5 = - _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m256i out_09_4 = - _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m256i out_09_5 = - _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m256i out_25_4 = - _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m256i out_25_5 = - _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m256i out_07_4 = - _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m256i out_07_5 = - _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m256i out_23_4 = - _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m256i out_23_5 = - _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m256i out_15_4 = - _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m256i out_15_5 = - _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m256i out_31_4 = - _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m256i out_31_5 = - _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[1] = _mm256_packs_epi32(out_01_6, out_01_7); - out[17] = _mm256_packs_epi32(out_17_6, out_17_7); - out[9] = _mm256_packs_epi32(out_09_6, out_09_7); - out[25] = _mm256_packs_epi32(out_25_6, out_25_7); - out[7] = _mm256_packs_epi32(out_07_6, out_07_7); - out[23] = _mm256_packs_epi32(out_23_6, out_23_7); - out[15] = _mm256_packs_epi32(out_15_6, out_15_7); - out[31] = _mm256_packs_epi32(out_31_6, out_31_7); - } - { - const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]); - const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]); - const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]); - const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]); - const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]); - const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]); - const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]); - const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]); - const __m256i out_05_2 = - _mm256_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m256i out_05_3 = - _mm256_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m256i out_21_2 = - _mm256_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m256i out_21_3 = - _mm256_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m256i out_13_2 = - _mm256_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m256i out_13_3 = - _mm256_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m256i out_29_2 = - _mm256_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m256i out_29_3 = - _mm256_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m256i out_03_2 = - _mm256_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m256i out_03_3 = - _mm256_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m256i out_19_2 = - _mm256_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m256i out_19_3 = - _mm256_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m256i out_11_2 = - _mm256_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m256i out_11_3 = - _mm256_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m256i out_27_2 = - _mm256_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m256i out_27_3 = - _mm256_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m256i out_05_4 = - _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m256i out_05_5 = - _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m256i out_21_4 = - _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m256i out_21_5 = - _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m256i out_13_4 = - _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m256i out_13_5 = - _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m256i out_29_4 = - _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m256i out_29_5 = - _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m256i out_03_4 = - _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m256i out_03_5 = - _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m256i out_19_4 = - _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m256i out_19_5 = - _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m256i out_11_4 = - _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m256i out_11_5 = - _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m256i out_27_4 = - _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m256i out_27_5 = - _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[5] = _mm256_packs_epi32(out_05_6, out_05_7); - out[21] = _mm256_packs_epi32(out_21_6, out_21_7); - out[13] = _mm256_packs_epi32(out_13_6, out_13_7); - out[29] = _mm256_packs_epi32(out_29_6, out_29_7); - out[3] = _mm256_packs_epi32(out_03_6, out_03_7); - out[19] = _mm256_packs_epi32(out_19_6, out_19_7); - out[11] = _mm256_packs_epi32(out_11_6, out_11_7); - out[27] = _mm256_packs_epi32(out_27_6, out_27_7); - } -#if FDCT32x32_HIGH_PRECISION - } else { - __m256i lstep1[64], lstep2[64], lstep3[64]; - __m256i u[32], v[32], sign[16]; - const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1); - // start using 32-bit operations - // stage 3 - { - // expanding to 32-bit length priori to addition operations - lstep2[0] = _mm256_unpacklo_epi16(step2[0], kZero); - lstep2[1] = _mm256_unpackhi_epi16(step2[0], kZero); - lstep2[2] = _mm256_unpacklo_epi16(step2[1], kZero); - lstep2[3] = _mm256_unpackhi_epi16(step2[1], kZero); - lstep2[4] = _mm256_unpacklo_epi16(step2[2], kZero); - lstep2[5] = _mm256_unpackhi_epi16(step2[2], kZero); - lstep2[6] = _mm256_unpacklo_epi16(step2[3], kZero); - lstep2[7] = _mm256_unpackhi_epi16(step2[3], kZero); - lstep2[8] = _mm256_unpacklo_epi16(step2[4], kZero); - lstep2[9] = _mm256_unpackhi_epi16(step2[4], kZero); - lstep2[10] = _mm256_unpacklo_epi16(step2[5], kZero); - lstep2[11] = _mm256_unpackhi_epi16(step2[5], kZero); - lstep2[12] = _mm256_unpacklo_epi16(step2[6], kZero); - lstep2[13] = _mm256_unpackhi_epi16(step2[6], kZero); - lstep2[14] = _mm256_unpacklo_epi16(step2[7], kZero); - lstep2[15] = _mm256_unpackhi_epi16(step2[7], kZero); - lstep2[0] = _mm256_madd_epi16(lstep2[0], kOne); - lstep2[1] = _mm256_madd_epi16(lstep2[1], kOne); - lstep2[2] = _mm256_madd_epi16(lstep2[2], kOne); - lstep2[3] = _mm256_madd_epi16(lstep2[3], kOne); - lstep2[4] = _mm256_madd_epi16(lstep2[4], kOne); - lstep2[5] = _mm256_madd_epi16(lstep2[5], kOne); - lstep2[6] = _mm256_madd_epi16(lstep2[6], kOne); - lstep2[7] = _mm256_madd_epi16(lstep2[7], kOne); - lstep2[8] = _mm256_madd_epi16(lstep2[8], kOne); - lstep2[9] = _mm256_madd_epi16(lstep2[9], kOne); - lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne); - lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne); - lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne); - lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne); - lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne); - lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne); - - lstep3[0] = _mm256_add_epi32(lstep2[14], lstep2[0]); - lstep3[1] = _mm256_add_epi32(lstep2[15], lstep2[1]); - lstep3[2] = _mm256_add_epi32(lstep2[12], lstep2[2]); - lstep3[3] = _mm256_add_epi32(lstep2[13], lstep2[3]); - lstep3[4] = _mm256_add_epi32(lstep2[10], lstep2[4]); - lstep3[5] = _mm256_add_epi32(lstep2[11], lstep2[5]); - lstep3[6] = _mm256_add_epi32(lstep2[8], lstep2[6]); - lstep3[7] = _mm256_add_epi32(lstep2[9], lstep2[7]); - lstep3[8] = _mm256_sub_epi32(lstep2[6], lstep2[8]); - lstep3[9] = _mm256_sub_epi32(lstep2[7], lstep2[9]); - lstep3[10] = _mm256_sub_epi32(lstep2[4], lstep2[10]); - lstep3[11] = _mm256_sub_epi32(lstep2[5], lstep2[11]); - lstep3[12] = _mm256_sub_epi32(lstep2[2], lstep2[12]); - lstep3[13] = _mm256_sub_epi32(lstep2[3], lstep2[13]); - lstep3[14] = _mm256_sub_epi32(lstep2[0], lstep2[14]); - lstep3[15] = _mm256_sub_epi32(lstep2[1], lstep2[15]); - } - { - const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]); - const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]); - const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]); - const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]); - const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m256i s3_10_4 = - _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m256i s3_10_5 = - _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m256i s3_11_4 = - _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m256i s3_11_5 = - _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m256i s3_12_4 = - _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m256i s3_12_5 = - _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m256i s3_13_4 = - _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m256i s3_13_5 = - _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS); - lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS); - lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS); - lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS); - lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS); - lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS); - lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS); - lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS); - } - { - lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero); - lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero); - lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero); - lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero); - lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero); - lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero); - lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero); - lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero); - lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero); - lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero); - lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero); - lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero); - lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero); - lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero); - lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero); - lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero); - lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne); - lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne); - lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne); - lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne); - lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne); - lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne); - lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne); - lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne); - lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne); - lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne); - lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne); - lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne); - lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne); - lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne); - lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne); - lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne); - - lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero); - lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero); - lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero); - lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero); - lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero); - lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero); - lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero); - lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero); - lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero); - lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero); - lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero); - lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero); - lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero); - lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero); - lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero); - lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero); - lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne); - lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne); - lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne); - lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne); - lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne); - lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne); - lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne); - lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne); - lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne); - lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne); - lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne); - lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne); - lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne); - lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne); - lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne); - lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne); - - lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]); - lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]); - - lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]); - lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]); - lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]); - lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]); - lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]); - lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]); - lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]); - lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]); - lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]); - lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]); - lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]); - lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]); - lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]); - lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]); - lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]); - lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]); - lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]); - lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]); - lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]); - lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]); - lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]); - lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]); - lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]); - lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]); - lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]); - lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]); - lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]); - lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]); - lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]); - lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]); - } - - // stage 4 - { - // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm256_unpacklo_epi16(step2[8], kZero); - lstep2[17] = _mm256_unpackhi_epi16(step2[8], kZero); - lstep2[18] = _mm256_unpacklo_epi16(step2[9], kZero); - lstep2[19] = _mm256_unpackhi_epi16(step2[9], kZero); - lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero); - lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero); - lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero); - lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero); - lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne); - lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne); - lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne); - lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne); - lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne); - lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne); - lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne); - lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne); - - lstep1[0] = _mm256_add_epi32(lstep3[6], lstep3[0]); - lstep1[1] = _mm256_add_epi32(lstep3[7], lstep3[1]); - lstep1[2] = _mm256_add_epi32(lstep3[4], lstep3[2]); - lstep1[3] = _mm256_add_epi32(lstep3[5], lstep3[3]); - lstep1[4] = _mm256_sub_epi32(lstep3[2], lstep3[4]); - lstep1[5] = _mm256_sub_epi32(lstep3[3], lstep3[5]); - lstep1[6] = _mm256_sub_epi32(lstep3[0], lstep3[6]); - lstep1[7] = _mm256_sub_epi32(lstep3[1], lstep3[7]); - lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]); - lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]); - lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]); - lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]); - lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]); - lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]); - lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]); - lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]); - lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]); - lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]); - lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]); - lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]); - lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]); - lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]); - lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]); - lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]); - } - { - // to be continued... - // - const __m256i k32_p16_p16 = - pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = - pair256_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide - // instruction latency. - v[0] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[1] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[2] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[3] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[4] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[5] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[6] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[7] = k_madd_epi32_avx2(u[3], k32_p16_p16); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - } - { - const __m256i k32_m08_p24 = - pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = - pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = - pair256_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]); - u[1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]); - u[2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]); - u[3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]); - u[4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]); - u[5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]); - u[6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]); - u[7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]); - u[8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]); - u[9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]); - u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]); - u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]); - u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]); - u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]); - u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]); - u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]); - - v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); - v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); - v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); - v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); - v[4] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[5] = k_madd_epi32_avx2(u[5], k32_m08_p24); - v[6] = k_madd_epi32_avx2(u[6], k32_m08_p24); - v[7] = k_madd_epi32_avx2(u[7], k32_m08_p24); - v[8] = k_madd_epi32_avx2(u[8], k32_m24_m08); - v[9] = k_madd_epi32_avx2(u[9], k32_m24_m08); - v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08); - v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08); - v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08); - v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08); - v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08); - v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08); - v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24); - v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24); - v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24); - v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24); - v[20] = k_madd_epi32_avx2(u[8], k32_m08_p24); - v[21] = k_madd_epi32_avx2(u[9], k32_m08_p24); - v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24); - v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24); - v[24] = k_madd_epi32_avx2(u[4], k32_p24_p08); - v[25] = k_madd_epi32_avx2(u[5], k32_p24_p08); - v[26] = k_madd_epi32_avx2(u[6], k32_p24_p08); - v[27] = k_madd_epi32_avx2(u[7], k32_p24_p08); - v[28] = k_madd_epi32_avx2(u[0], k32_p24_p08); - v[29] = k_madd_epi32_avx2(u[1], k32_p24_p08); - v[30] = k_madd_epi32_avx2(u[2], k32_p24_p08); - v[31] = k_madd_epi32_avx2(u[3], k32_p24_p08); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep1[36] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[37] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[38] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[39] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - lstep1[40] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - lstep1[41] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - lstep1[42] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - lstep1[43] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - lstep1[52] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - lstep1[53] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 5 - { - lstep2[8] = _mm256_add_epi32(lstep1[10], lstep3[8]); - lstep2[9] = _mm256_add_epi32(lstep1[11], lstep3[9]); - lstep2[10] = _mm256_sub_epi32(lstep3[8], lstep1[10]); - lstep2[11] = _mm256_sub_epi32(lstep3[9], lstep1[11]); - lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]); - lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]); - lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]); - lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]); - } - { - const __m256i k32_p16_p16 = - pair256_set_epi32(cospi_16_64, cospi_16_64); - const __m256i k32_p16_m16 = - pair256_set_epi32(cospi_16_64, -cospi_16_64); - const __m256i k32_p24_p08 = - pair256_set_epi32(cospi_24_64, cospi_8_64); - const __m256i k32_m08_p24 = - pair256_set_epi32(-cospi_8_64, cospi_24_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]); - u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]); - u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]); - u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]); - u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]); - u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]); - u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]); - u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]); - - // TODO(jingning): manually inline k_madd_epi32_avx2_ to further hide - // instruction latency. - v[0] = k_madd_epi32_avx2(u[0], k32_p16_p16); - v[1] = k_madd_epi32_avx2(u[1], k32_p16_p16); - v[2] = k_madd_epi32_avx2(u[2], k32_p16_p16); - v[3] = k_madd_epi32_avx2(u[3], k32_p16_p16); - v[4] = k_madd_epi32_avx2(u[0], k32_p16_m16); - v[5] = k_madd_epi32_avx2(u[1], k32_p16_m16); - v[6] = k_madd_epi32_avx2(u[2], k32_p16_m16); - v[7] = k_madd_epi32_avx2(u[3], k32_p16_m16); - v[8] = k_madd_epi32_avx2(u[4], k32_p24_p08); - v[9] = k_madd_epi32_avx2(u[5], k32_p24_p08); - v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08); - v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08); - v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24); - v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24); - v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); - - u[0] = _mm256_sub_epi32(u[0], sign[0]); - u[1] = _mm256_sub_epi32(u[1], sign[1]); - u[2] = _mm256_sub_epi32(u[2], sign[2]); - u[3] = _mm256_sub_epi32(u[3], sign[3]); - u[4] = _mm256_sub_epi32(u[4], sign[4]); - u[5] = _mm256_sub_epi32(u[5], sign[5]); - u[6] = _mm256_sub_epi32(u[6], sign[6]); - u[7] = _mm256_sub_epi32(u[7], sign[7]); - - u[0] = _mm256_add_epi32(u[0], K32One); - u[1] = _mm256_add_epi32(u[1], K32One); - u[2] = _mm256_add_epi32(u[2], K32One); - u[3] = _mm256_add_epi32(u[3], K32One); - u[4] = _mm256_add_epi32(u[4], K32One); - u[5] = _mm256_add_epi32(u[5], K32One); - u[6] = _mm256_add_epi32(u[6], K32One); - u[7] = _mm256_add_epi32(u[7], K32One); - - u[0] = _mm256_srai_epi32(u[0], 2); - u[1] = _mm256_srai_epi32(u[1], 2); - u[2] = _mm256_srai_epi32(u[2], 2); - u[3] = _mm256_srai_epi32(u[3], 2); - u[4] = _mm256_srai_epi32(u[4], 2); - u[5] = _mm256_srai_epi32(u[5], 2); - u[6] = _mm256_srai_epi32(u[6], 2); - u[7] = _mm256_srai_epi32(u[7], 2); - - // Combine - out[0] = _mm256_packs_epi32(u[0], u[1]); - out[16] = _mm256_packs_epi32(u[2], u[3]); - out[8] = _mm256_packs_epi32(u[4], u[5]); - out[24] = _mm256_packs_epi32(u[6], u[7]); - } - { - const __m256i k32_m08_p24 = - pair256_set_epi32(-cospi_8_64, cospi_24_64); - const __m256i k32_m24_m08 = - pair256_set_epi32(-cospi_24_64, -cospi_8_64); - const __m256i k32_p24_p08 = - pair256_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]); - u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]); - u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]); - u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]); - u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]); - u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]); - u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]); - u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]); - - v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24); - v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24); - v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24); - v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24); - v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08); - v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08); - v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08); - v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08); - v[8] = k_madd_epi32_avx2(u[4], k32_m08_p24); - v[9] = k_madd_epi32_avx2(u[5], k32_m08_p24); - v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24); - v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24); - v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08); - v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08); - v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08); - v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - - u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS); - lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS); - lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS); - lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS); - lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS); - lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS); - lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS); - lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS); - } - { - lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]); - lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]); - lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]); - lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]); - lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]); - lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]); - lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]); - lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]); - lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]); - lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]); - lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]); - lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]); - lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]); - lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]); - lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]); - lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]); - lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]); - lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]); - lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]); - lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]); - lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]); - lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]); - lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]); - lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]); - lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]); - lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]); - lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]); - lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]); - lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]); - lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]); - lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]); - lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]); - } - // stage 6 - { - const __m256i k32_p28_p04 = - pair256_set_epi32(cospi_28_64, cospi_4_64); - const __m256i k32_p12_p20 = - pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_m20_p12 = - pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m04_p28 = - pair256_set_epi32(-cospi_4_64, cospi_28_64); - - u[0] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); - u[1] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); - u[2] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); - u[3] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); - u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); - u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); - u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); - u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); - u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]); - u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]); - u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]); - u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm256_unpacklo_epi32(lstep2[8], lstep2[14]); - u[13] = _mm256_unpackhi_epi32(lstep2[8], lstep2[14]); - u[14] = _mm256_unpacklo_epi32(lstep2[9], lstep2[15]); - u[15] = _mm256_unpackhi_epi32(lstep2[9], lstep2[15]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04); - v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04); - v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04); - v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04); - v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20); - v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20); - v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20); - v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20); - v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); - v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); - v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); - v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); - v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28); - v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28); - v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28); - v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm256_cmpgt_epi32(kZero, u[0]); - sign[1] = _mm256_cmpgt_epi32(kZero, u[1]); - sign[2] = _mm256_cmpgt_epi32(kZero, u[2]); - sign[3] = _mm256_cmpgt_epi32(kZero, u[3]); - sign[4] = _mm256_cmpgt_epi32(kZero, u[4]); - sign[5] = _mm256_cmpgt_epi32(kZero, u[5]); - sign[6] = _mm256_cmpgt_epi32(kZero, u[6]); - sign[7] = _mm256_cmpgt_epi32(kZero, u[7]); - - u[0] = _mm256_sub_epi32(u[0], sign[0]); - u[1] = _mm256_sub_epi32(u[1], sign[1]); - u[2] = _mm256_sub_epi32(u[2], sign[2]); - u[3] = _mm256_sub_epi32(u[3], sign[3]); - u[4] = _mm256_sub_epi32(u[4], sign[4]); - u[5] = _mm256_sub_epi32(u[5], sign[5]); - u[6] = _mm256_sub_epi32(u[6], sign[6]); - u[7] = _mm256_sub_epi32(u[7], sign[7]); - - u[0] = _mm256_add_epi32(u[0], K32One); - u[1] = _mm256_add_epi32(u[1], K32One); - u[2] = _mm256_add_epi32(u[2], K32One); - u[3] = _mm256_add_epi32(u[3], K32One); - u[4] = _mm256_add_epi32(u[4], K32One); - u[5] = _mm256_add_epi32(u[5], K32One); - u[6] = _mm256_add_epi32(u[6], K32One); - u[7] = _mm256_add_epi32(u[7], K32One); - - u[0] = _mm256_srai_epi32(u[0], 2); - u[1] = _mm256_srai_epi32(u[1], 2); - u[2] = _mm256_srai_epi32(u[2], 2); - u[3] = _mm256_srai_epi32(u[3], 2); - u[4] = _mm256_srai_epi32(u[4], 2); - u[5] = _mm256_srai_epi32(u[5], 2); - u[6] = _mm256_srai_epi32(u[6], 2); - u[7] = _mm256_srai_epi32(u[7], 2); - - out[4] = _mm256_packs_epi32(u[0], u[1]); - out[20] = _mm256_packs_epi32(u[2], u[3]); - out[12] = _mm256_packs_epi32(u[4], u[5]); - out[28] = _mm256_packs_epi32(u[6], u[7]); - } - { - lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]); - lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]); - lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]); - lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]); - lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]); - lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]); - lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]); - lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]); - lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]); - lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]); - lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]); - lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]); - lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]); - lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]); - lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]); - lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]); - } - { - const __m256i k32_m04_p28 = - pair256_set_epi32(-cospi_4_64, cospi_28_64); - const __m256i k32_m28_m04 = - pair256_set_epi32(-cospi_28_64, -cospi_4_64); - const __m256i k32_m20_p12 = - pair256_set_epi32(-cospi_20_64, cospi_12_64); - const __m256i k32_m12_m20 = - pair256_set_epi32(-cospi_12_64, -cospi_20_64); - const __m256i k32_p12_p20 = - pair256_set_epi32(cospi_12_64, cospi_20_64); - const __m256i k32_p28_p04 = - pair256_set_epi32(cospi_28_64, cospi_4_64); - - u[0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]); - u[1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]); - u[2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]); - u[3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]); - u[4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]); - u[5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]); - u[6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]); - u[7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]); - u[8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]); - u[9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]); - u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]); - u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]); - u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]); - u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]); - u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]); - u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]); - - v[0] = k_madd_epi32_avx2(u[0], k32_m04_p28); - v[1] = k_madd_epi32_avx2(u[1], k32_m04_p28); - v[2] = k_madd_epi32_avx2(u[2], k32_m04_p28); - v[3] = k_madd_epi32_avx2(u[3], k32_m04_p28); - v[4] = k_madd_epi32_avx2(u[4], k32_m28_m04); - v[5] = k_madd_epi32_avx2(u[5], k32_m28_m04); - v[6] = k_madd_epi32_avx2(u[6], k32_m28_m04); - v[7] = k_madd_epi32_avx2(u[7], k32_m28_m04); - v[8] = k_madd_epi32_avx2(u[8], k32_m20_p12); - v[9] = k_madd_epi32_avx2(u[9], k32_m20_p12); - v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12); - v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12); - v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20); - v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20); - v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20); - v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20); - v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12); - v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12); - v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12); - v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12); - v[20] = k_madd_epi32_avx2(u[8], k32_p12_p20); - v[21] = k_madd_epi32_avx2(u[9], k32_p12_p20); - v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20); - v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20); - v[24] = k_madd_epi32_avx2(u[4], k32_m04_p28); - v[25] = k_madd_epi32_avx2(u[5], k32_m04_p28); - v[26] = k_madd_epi32_avx2(u[6], k32_m04_p28); - v[27] = k_madd_epi32_avx2(u[7], k32_m04_p28); - v[28] = k_madd_epi32_avx2(u[0], k32_p28_p04); - v[29] = k_madd_epi32_avx2(u[1], k32_p28_p04); - v[30] = k_madd_epi32_avx2(u[2], k32_p28_p04); - v[31] = k_madd_epi32_avx2(u[3], k32_p28_p04); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep3[34] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - lstep3[35] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - lstep3[36] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - lstep3[37] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - lstep3[42] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - lstep3[43] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - lstep3[44] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - lstep3[45] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - lstep3[50] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - lstep3[51] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 7 - { - const __m256i k32_p30_p02 = - pair256_set_epi32(cospi_30_64, cospi_2_64); - const __m256i k32_p14_p18 = - pair256_set_epi32(cospi_14_64, cospi_18_64); - const __m256i k32_p22_p10 = - pair256_set_epi32(cospi_22_64, cospi_10_64); - const __m256i k32_p06_p26 = - pair256_set_epi32(cospi_6_64, cospi_26_64); - const __m256i k32_m26_p06 = - pair256_set_epi32(-cospi_26_64, cospi_6_64); - const __m256i k32_m10_p22 = - pair256_set_epi32(-cospi_10_64, cospi_22_64); - const __m256i k32_m18_p14 = - pair256_set_epi32(-cospi_18_64, cospi_14_64); - const __m256i k32_m02_p30 = - pair256_set_epi32(-cospi_2_64, cospi_30_64); - - u[0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]); - u[1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]); - u[2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]); - u[3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]); - u[4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]); - u[5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]); - u[6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]); - u[7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]); - u[8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]); - u[9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]); - u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]); - u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]); - u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]); - u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]); - u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]); - u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p30_p02); - v[1] = k_madd_epi32_avx2(u[1], k32_p30_p02); - v[2] = k_madd_epi32_avx2(u[2], k32_p30_p02); - v[3] = k_madd_epi32_avx2(u[3], k32_p30_p02); - v[4] = k_madd_epi32_avx2(u[4], k32_p14_p18); - v[5] = k_madd_epi32_avx2(u[5], k32_p14_p18); - v[6] = k_madd_epi32_avx2(u[6], k32_p14_p18); - v[7] = k_madd_epi32_avx2(u[7], k32_p14_p18); - v[8] = k_madd_epi32_avx2(u[8], k32_p22_p10); - v[9] = k_madd_epi32_avx2(u[9], k32_p22_p10); - v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10); - v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10); - v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26); - v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26); - v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26); - v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26); - v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06); - v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06); - v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06); - v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06); - v[20] = k_madd_epi32_avx2(u[8], k32_m10_p22); - v[21] = k_madd_epi32_avx2(u[9], k32_m10_p22); - v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22); - v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22); - v[24] = k_madd_epi32_avx2(u[4], k32_m18_p14); - v[25] = k_madd_epi32_avx2(u[5], k32_m18_p14); - v[26] = k_madd_epi32_avx2(u[6], k32_m18_p14); - v[27] = k_madd_epi32_avx2(u[7], k32_m18_p14); - v[28] = k_madd_epi32_avx2(u[0], k32_m02_p30); - v[29] = k_madd_epi32_avx2(u[1], k32_m02_p30); - v[30] = k_madd_epi32_avx2(u[2], k32_m02_p30); - v[31] = k_madd_epi32_avx2(u[3], k32_m02_p30); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm256_cmpgt_epi32(kZero, u[0]); - v[1] = _mm256_cmpgt_epi32(kZero, u[1]); - v[2] = _mm256_cmpgt_epi32(kZero, u[2]); - v[3] = _mm256_cmpgt_epi32(kZero, u[3]); - v[4] = _mm256_cmpgt_epi32(kZero, u[4]); - v[5] = _mm256_cmpgt_epi32(kZero, u[5]); - v[6] = _mm256_cmpgt_epi32(kZero, u[6]); - v[7] = _mm256_cmpgt_epi32(kZero, u[7]); - v[8] = _mm256_cmpgt_epi32(kZero, u[8]); - v[9] = _mm256_cmpgt_epi32(kZero, u[9]); - v[10] = _mm256_cmpgt_epi32(kZero, u[10]); - v[11] = _mm256_cmpgt_epi32(kZero, u[11]); - v[12] = _mm256_cmpgt_epi32(kZero, u[12]); - v[13] = _mm256_cmpgt_epi32(kZero, u[13]); - v[14] = _mm256_cmpgt_epi32(kZero, u[14]); - v[15] = _mm256_cmpgt_epi32(kZero, u[15]); - - u[0] = _mm256_sub_epi32(u[0], v[0]); - u[1] = _mm256_sub_epi32(u[1], v[1]); - u[2] = _mm256_sub_epi32(u[2], v[2]); - u[3] = _mm256_sub_epi32(u[3], v[3]); - u[4] = _mm256_sub_epi32(u[4], v[4]); - u[5] = _mm256_sub_epi32(u[5], v[5]); - u[6] = _mm256_sub_epi32(u[6], v[6]); - u[7] = _mm256_sub_epi32(u[7], v[7]); - u[8] = _mm256_sub_epi32(u[8], v[8]); - u[9] = _mm256_sub_epi32(u[9], v[9]); - u[10] = _mm256_sub_epi32(u[10], v[10]); - u[11] = _mm256_sub_epi32(u[11], v[11]); - u[12] = _mm256_sub_epi32(u[12], v[12]); - u[13] = _mm256_sub_epi32(u[13], v[13]); - u[14] = _mm256_sub_epi32(u[14], v[14]); - u[15] = _mm256_sub_epi32(u[15], v[15]); - - v[0] = _mm256_add_epi32(u[0], K32One); - v[1] = _mm256_add_epi32(u[1], K32One); - v[2] = _mm256_add_epi32(u[2], K32One); - v[3] = _mm256_add_epi32(u[3], K32One); - v[4] = _mm256_add_epi32(u[4], K32One); - v[5] = _mm256_add_epi32(u[5], K32One); - v[6] = _mm256_add_epi32(u[6], K32One); - v[7] = _mm256_add_epi32(u[7], K32One); - v[8] = _mm256_add_epi32(u[8], K32One); - v[9] = _mm256_add_epi32(u[9], K32One); - v[10] = _mm256_add_epi32(u[10], K32One); - v[11] = _mm256_add_epi32(u[11], K32One); - v[12] = _mm256_add_epi32(u[12], K32One); - v[13] = _mm256_add_epi32(u[13], K32One); - v[14] = _mm256_add_epi32(u[14], K32One); - v[15] = _mm256_add_epi32(u[15], K32One); - - u[0] = _mm256_srai_epi32(v[0], 2); - u[1] = _mm256_srai_epi32(v[1], 2); - u[2] = _mm256_srai_epi32(v[2], 2); - u[3] = _mm256_srai_epi32(v[3], 2); - u[4] = _mm256_srai_epi32(v[4], 2); - u[5] = _mm256_srai_epi32(v[5], 2); - u[6] = _mm256_srai_epi32(v[6], 2); - u[7] = _mm256_srai_epi32(v[7], 2); - u[8] = _mm256_srai_epi32(v[8], 2); - u[9] = _mm256_srai_epi32(v[9], 2); - u[10] = _mm256_srai_epi32(v[10], 2); - u[11] = _mm256_srai_epi32(v[11], 2); - u[12] = _mm256_srai_epi32(v[12], 2); - u[13] = _mm256_srai_epi32(v[13], 2); - u[14] = _mm256_srai_epi32(v[14], 2); - u[15] = _mm256_srai_epi32(v[15], 2); - - out[2] = _mm256_packs_epi32(u[0], u[1]); - out[18] = _mm256_packs_epi32(u[2], u[3]); - out[10] = _mm256_packs_epi32(u[4], u[5]); - out[26] = _mm256_packs_epi32(u[6], u[7]); - out[6] = _mm256_packs_epi32(u[8], u[9]); - out[22] = _mm256_packs_epi32(u[10], u[11]); - out[14] = _mm256_packs_epi32(u[12], u[13]); - out[30] = _mm256_packs_epi32(u[14], u[15]); - } - { - lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]); - lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]); - lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]); - lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]); - lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]); - lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]); - lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]); - lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]); - lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]); - lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]); - lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]); - lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]); - lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]); - lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]); - lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]); - lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]); - lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]); - lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]); - lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]); - lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]); - lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]); - lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]); - lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]); - lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]); - lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]); - lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]); - lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]); - lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]); - lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]); - lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]); - lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]); - lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]); - } - // stage 8 - { - const __m256i k32_p31_p01 = - pair256_set_epi32(cospi_31_64, cospi_1_64); - const __m256i k32_p15_p17 = - pair256_set_epi32(cospi_15_64, cospi_17_64); - const __m256i k32_p23_p09 = - pair256_set_epi32(cospi_23_64, cospi_9_64); - const __m256i k32_p07_p25 = - pair256_set_epi32(cospi_7_64, cospi_25_64); - const __m256i k32_m25_p07 = - pair256_set_epi32(-cospi_25_64, cospi_7_64); - const __m256i k32_m09_p23 = - pair256_set_epi32(-cospi_9_64, cospi_23_64); - const __m256i k32_m17_p15 = - pair256_set_epi32(-cospi_17_64, cospi_15_64); - const __m256i k32_m01_p31 = - pair256_set_epi32(-cospi_1_64, cospi_31_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]); - u[1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]); - u[2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]); - u[3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]); - u[4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]); - u[5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]); - u[6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]); - u[7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]); - u[8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]); - u[9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]); - u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]); - u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]); - u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]); - u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]); - u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]); - u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p31_p01); - v[1] = k_madd_epi32_avx2(u[1], k32_p31_p01); - v[2] = k_madd_epi32_avx2(u[2], k32_p31_p01); - v[3] = k_madd_epi32_avx2(u[3], k32_p31_p01); - v[4] = k_madd_epi32_avx2(u[4], k32_p15_p17); - v[5] = k_madd_epi32_avx2(u[5], k32_p15_p17); - v[6] = k_madd_epi32_avx2(u[6], k32_p15_p17); - v[7] = k_madd_epi32_avx2(u[7], k32_p15_p17); - v[8] = k_madd_epi32_avx2(u[8], k32_p23_p09); - v[9] = k_madd_epi32_avx2(u[9], k32_p23_p09); - v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09); - v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09); - v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25); - v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25); - v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25); - v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25); - v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07); - v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07); - v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07); - v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07); - v[20] = k_madd_epi32_avx2(u[8], k32_m09_p23); - v[21] = k_madd_epi32_avx2(u[9], k32_m09_p23); - v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23); - v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23); - v[24] = k_madd_epi32_avx2(u[4], k32_m17_p15); - v[25] = k_madd_epi32_avx2(u[5], k32_m17_p15); - v[26] = k_madd_epi32_avx2(u[6], k32_m17_p15); - v[27] = k_madd_epi32_avx2(u[7], k32_m17_p15); - v[28] = k_madd_epi32_avx2(u[0], k32_m01_p31); - v[29] = k_madd_epi32_avx2(u[1], k32_m01_p31); - v[30] = k_madd_epi32_avx2(u[2], k32_m01_p31); - v[31] = k_madd_epi32_avx2(u[3], k32_m01_p31); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm256_cmpgt_epi32(kZero, u[0]); - v[1] = _mm256_cmpgt_epi32(kZero, u[1]); - v[2] = _mm256_cmpgt_epi32(kZero, u[2]); - v[3] = _mm256_cmpgt_epi32(kZero, u[3]); - v[4] = _mm256_cmpgt_epi32(kZero, u[4]); - v[5] = _mm256_cmpgt_epi32(kZero, u[5]); - v[6] = _mm256_cmpgt_epi32(kZero, u[6]); - v[7] = _mm256_cmpgt_epi32(kZero, u[7]); - v[8] = _mm256_cmpgt_epi32(kZero, u[8]); - v[9] = _mm256_cmpgt_epi32(kZero, u[9]); - v[10] = _mm256_cmpgt_epi32(kZero, u[10]); - v[11] = _mm256_cmpgt_epi32(kZero, u[11]); - v[12] = _mm256_cmpgt_epi32(kZero, u[12]); - v[13] = _mm256_cmpgt_epi32(kZero, u[13]); - v[14] = _mm256_cmpgt_epi32(kZero, u[14]); - v[15] = _mm256_cmpgt_epi32(kZero, u[15]); - - u[0] = _mm256_sub_epi32(u[0], v[0]); - u[1] = _mm256_sub_epi32(u[1], v[1]); - u[2] = _mm256_sub_epi32(u[2], v[2]); - u[3] = _mm256_sub_epi32(u[3], v[3]); - u[4] = _mm256_sub_epi32(u[4], v[4]); - u[5] = _mm256_sub_epi32(u[5], v[5]); - u[6] = _mm256_sub_epi32(u[6], v[6]); - u[7] = _mm256_sub_epi32(u[7], v[7]); - u[8] = _mm256_sub_epi32(u[8], v[8]); - u[9] = _mm256_sub_epi32(u[9], v[9]); - u[10] = _mm256_sub_epi32(u[10], v[10]); - u[11] = _mm256_sub_epi32(u[11], v[11]); - u[12] = _mm256_sub_epi32(u[12], v[12]); - u[13] = _mm256_sub_epi32(u[13], v[13]); - u[14] = _mm256_sub_epi32(u[14], v[14]); - u[15] = _mm256_sub_epi32(u[15], v[15]); - - v[0] = _mm256_add_epi32(u[0], K32One); - v[1] = _mm256_add_epi32(u[1], K32One); - v[2] = _mm256_add_epi32(u[2], K32One); - v[3] = _mm256_add_epi32(u[3], K32One); - v[4] = _mm256_add_epi32(u[4], K32One); - v[5] = _mm256_add_epi32(u[5], K32One); - v[6] = _mm256_add_epi32(u[6], K32One); - v[7] = _mm256_add_epi32(u[7], K32One); - v[8] = _mm256_add_epi32(u[8], K32One); - v[9] = _mm256_add_epi32(u[9], K32One); - v[10] = _mm256_add_epi32(u[10], K32One); - v[11] = _mm256_add_epi32(u[11], K32One); - v[12] = _mm256_add_epi32(u[12], K32One); - v[13] = _mm256_add_epi32(u[13], K32One); - v[14] = _mm256_add_epi32(u[14], K32One); - v[15] = _mm256_add_epi32(u[15], K32One); - - u[0] = _mm256_srai_epi32(v[0], 2); - u[1] = _mm256_srai_epi32(v[1], 2); - u[2] = _mm256_srai_epi32(v[2], 2); - u[3] = _mm256_srai_epi32(v[3], 2); - u[4] = _mm256_srai_epi32(v[4], 2); - u[5] = _mm256_srai_epi32(v[5], 2); - u[6] = _mm256_srai_epi32(v[6], 2); - u[7] = _mm256_srai_epi32(v[7], 2); - u[8] = _mm256_srai_epi32(v[8], 2); - u[9] = _mm256_srai_epi32(v[9], 2); - u[10] = _mm256_srai_epi32(v[10], 2); - u[11] = _mm256_srai_epi32(v[11], 2); - u[12] = _mm256_srai_epi32(v[12], 2); - u[13] = _mm256_srai_epi32(v[13], 2); - u[14] = _mm256_srai_epi32(v[14], 2); - u[15] = _mm256_srai_epi32(v[15], 2); - - out[1] = _mm256_packs_epi32(u[0], u[1]); - out[17] = _mm256_packs_epi32(u[2], u[3]); - out[9] = _mm256_packs_epi32(u[4], u[5]); - out[25] = _mm256_packs_epi32(u[6], u[7]); - out[7] = _mm256_packs_epi32(u[8], u[9]); - out[23] = _mm256_packs_epi32(u[10], u[11]); - out[15] = _mm256_packs_epi32(u[12], u[13]); - out[31] = _mm256_packs_epi32(u[14], u[15]); - } - { - const __m256i k32_p27_p05 = - pair256_set_epi32(cospi_27_64, cospi_5_64); - const __m256i k32_p11_p21 = - pair256_set_epi32(cospi_11_64, cospi_21_64); - const __m256i k32_p19_p13 = - pair256_set_epi32(cospi_19_64, cospi_13_64); - const __m256i k32_p03_p29 = - pair256_set_epi32(cospi_3_64, cospi_29_64); - const __m256i k32_m29_p03 = - pair256_set_epi32(-cospi_29_64, cospi_3_64); - const __m256i k32_m13_p19 = - pair256_set_epi32(-cospi_13_64, cospi_19_64); - const __m256i k32_m21_p11 = - pair256_set_epi32(-cospi_21_64, cospi_11_64); - const __m256i k32_m05_p27 = - pair256_set_epi32(-cospi_5_64, cospi_27_64); - - u[0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]); - u[1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]); - u[2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]); - u[3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]); - u[4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]); - u[5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]); - u[6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]); - u[7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]); - u[8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]); - u[9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]); - u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]); - u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]); - u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]); - u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]); - u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]); - u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]); - - v[0] = k_madd_epi32_avx2(u[0], k32_p27_p05); - v[1] = k_madd_epi32_avx2(u[1], k32_p27_p05); - v[2] = k_madd_epi32_avx2(u[2], k32_p27_p05); - v[3] = k_madd_epi32_avx2(u[3], k32_p27_p05); - v[4] = k_madd_epi32_avx2(u[4], k32_p11_p21); - v[5] = k_madd_epi32_avx2(u[5], k32_p11_p21); - v[6] = k_madd_epi32_avx2(u[6], k32_p11_p21); - v[7] = k_madd_epi32_avx2(u[7], k32_p11_p21); - v[8] = k_madd_epi32_avx2(u[8], k32_p19_p13); - v[9] = k_madd_epi32_avx2(u[9], k32_p19_p13); - v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13); - v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13); - v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29); - v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29); - v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29); - v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29); - v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03); - v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03); - v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03); - v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03); - v[20] = k_madd_epi32_avx2(u[8], k32_m13_p19); - v[21] = k_madd_epi32_avx2(u[9], k32_m13_p19); - v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19); - v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19); - v[24] = k_madd_epi32_avx2(u[4], k32_m21_p11); - v[25] = k_madd_epi32_avx2(u[5], k32_m21_p11); - v[26] = k_madd_epi32_avx2(u[6], k32_m21_p11); - v[27] = k_madd_epi32_avx2(u[7], k32_m21_p11); - v[28] = k_madd_epi32_avx2(u[0], k32_m05_p27); - v[29] = k_madd_epi32_avx2(u[1], k32_m05_p27); - v[30] = k_madd_epi32_avx2(u[2], k32_m05_p27); - v[31] = k_madd_epi32_avx2(u[3], k32_m05_p27); - - u[0] = k_packs_epi64_avx2(v[0], v[1]); - u[1] = k_packs_epi64_avx2(v[2], v[3]); - u[2] = k_packs_epi64_avx2(v[4], v[5]); - u[3] = k_packs_epi64_avx2(v[6], v[7]); - u[4] = k_packs_epi64_avx2(v[8], v[9]); - u[5] = k_packs_epi64_avx2(v[10], v[11]); - u[6] = k_packs_epi64_avx2(v[12], v[13]); - u[7] = k_packs_epi64_avx2(v[14], v[15]); - u[8] = k_packs_epi64_avx2(v[16], v[17]); - u[9] = k_packs_epi64_avx2(v[18], v[19]); - u[10] = k_packs_epi64_avx2(v[20], v[21]); - u[11] = k_packs_epi64_avx2(v[22], v[23]); - u[12] = k_packs_epi64_avx2(v[24], v[25]); - u[13] = k_packs_epi64_avx2(v[26], v[27]); - u[14] = k_packs_epi64_avx2(v[28], v[29]); - u[15] = k_packs_epi64_avx2(v[30], v[31]); - - v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm256_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm256_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm256_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm256_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm256_cmpgt_epi32(kZero, u[0]); - v[1] = _mm256_cmpgt_epi32(kZero, u[1]); - v[2] = _mm256_cmpgt_epi32(kZero, u[2]); - v[3] = _mm256_cmpgt_epi32(kZero, u[3]); - v[4] = _mm256_cmpgt_epi32(kZero, u[4]); - v[5] = _mm256_cmpgt_epi32(kZero, u[5]); - v[6] = _mm256_cmpgt_epi32(kZero, u[6]); - v[7] = _mm256_cmpgt_epi32(kZero, u[7]); - v[8] = _mm256_cmpgt_epi32(kZero, u[8]); - v[9] = _mm256_cmpgt_epi32(kZero, u[9]); - v[10] = _mm256_cmpgt_epi32(kZero, u[10]); - v[11] = _mm256_cmpgt_epi32(kZero, u[11]); - v[12] = _mm256_cmpgt_epi32(kZero, u[12]); - v[13] = _mm256_cmpgt_epi32(kZero, u[13]); - v[14] = _mm256_cmpgt_epi32(kZero, u[14]); - v[15] = _mm256_cmpgt_epi32(kZero, u[15]); - - u[0] = _mm256_sub_epi32(u[0], v[0]); - u[1] = _mm256_sub_epi32(u[1], v[1]); - u[2] = _mm256_sub_epi32(u[2], v[2]); - u[3] = _mm256_sub_epi32(u[3], v[3]); - u[4] = _mm256_sub_epi32(u[4], v[4]); - u[5] = _mm256_sub_epi32(u[5], v[5]); - u[6] = _mm256_sub_epi32(u[6], v[6]); - u[7] = _mm256_sub_epi32(u[7], v[7]); - u[8] = _mm256_sub_epi32(u[8], v[8]); - u[9] = _mm256_sub_epi32(u[9], v[9]); - u[10] = _mm256_sub_epi32(u[10], v[10]); - u[11] = _mm256_sub_epi32(u[11], v[11]); - u[12] = _mm256_sub_epi32(u[12], v[12]); - u[13] = _mm256_sub_epi32(u[13], v[13]); - u[14] = _mm256_sub_epi32(u[14], v[14]); - u[15] = _mm256_sub_epi32(u[15], v[15]); - - v[0] = _mm256_add_epi32(u[0], K32One); - v[1] = _mm256_add_epi32(u[1], K32One); - v[2] = _mm256_add_epi32(u[2], K32One); - v[3] = _mm256_add_epi32(u[3], K32One); - v[4] = _mm256_add_epi32(u[4], K32One); - v[5] = _mm256_add_epi32(u[5], K32One); - v[6] = _mm256_add_epi32(u[6], K32One); - v[7] = _mm256_add_epi32(u[7], K32One); - v[8] = _mm256_add_epi32(u[8], K32One); - v[9] = _mm256_add_epi32(u[9], K32One); - v[10] = _mm256_add_epi32(u[10], K32One); - v[11] = _mm256_add_epi32(u[11], K32One); - v[12] = _mm256_add_epi32(u[12], K32One); - v[13] = _mm256_add_epi32(u[13], K32One); - v[14] = _mm256_add_epi32(u[14], K32One); - v[15] = _mm256_add_epi32(u[15], K32One); - - u[0] = _mm256_srai_epi32(v[0], 2); - u[1] = _mm256_srai_epi32(v[1], 2); - u[2] = _mm256_srai_epi32(v[2], 2); - u[3] = _mm256_srai_epi32(v[3], 2); - u[4] = _mm256_srai_epi32(v[4], 2); - u[5] = _mm256_srai_epi32(v[5], 2); - u[6] = _mm256_srai_epi32(v[6], 2); - u[7] = _mm256_srai_epi32(v[7], 2); - u[8] = _mm256_srai_epi32(v[8], 2); - u[9] = _mm256_srai_epi32(v[9], 2); - u[10] = _mm256_srai_epi32(v[10], 2); - u[11] = _mm256_srai_epi32(v[11], 2); - u[12] = _mm256_srai_epi32(v[12], 2); - u[13] = _mm256_srai_epi32(v[13], 2); - u[14] = _mm256_srai_epi32(v[14], 2); - u[15] = _mm256_srai_epi32(v[15], 2); - - out[5] = _mm256_packs_epi32(u[0], u[1]); - out[21] = _mm256_packs_epi32(u[2], u[3]); - out[13] = _mm256_packs_epi32(u[4], u[5]); - out[29] = _mm256_packs_epi32(u[6], u[7]); - out[3] = _mm256_packs_epi32(u[8], u[9]); - out[19] = _mm256_packs_epi32(u[10], u[11]); - out[11] = _mm256_packs_epi32(u[12], u[13]); - out[27] = _mm256_packs_epi32(u[14], u[15]); - } - } -#endif - // Transpose the results, do it as four 8x8 transposes. - { - int transpose_block; - int16_t *output_currStep, *output_nextStep; - tran_low_t *curr_out, *next_out; - // Pass 0 - output_currStep = &intermediate[column_start * 32]; - output_nextStep = &intermediate[(column_start + 8) * 32]; - // Pass 1 - curr_out = &output_org[column_start * 32]; - next_out = &output_org[(column_start + 8) * 32]; - - for (transpose_block = 0; transpose_block < 4; ++transpose_block) { - __m256i *this_out = &out[8 * transpose_block]; - // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 - // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 - // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 - // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 - // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 - // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 - // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 - const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]); - const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]); - const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]); - const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]); - const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]); - const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]); - const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]); - const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]); - // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31 - // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71 - // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35 - // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75 - // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 101 - // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151 - // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115 - // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155 - - const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1); - const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3); - const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1); - const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3); - const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5); - const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7); - const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5); - const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7); - // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69 - // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73 - // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71 - // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75 - // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149 - // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153 - // 82 102 122 142 83 103 123 143 90 110 130 150 91 101 131 151 - // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155 - __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4); - __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4); - __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6); - __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6); - __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5); - __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5); - __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7); - __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7); - // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148 - // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149 - // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150 - // 03 23 43 63 83 103 123 143 11 31 51 71 91 101 131 151 - // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152 - // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153 - // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154 - // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155 - if (0 == pass) { - // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; - // TODO(cd): see quality impact of only doing - // output[j] = (output[j] + 1) >> 2; - // which would remove the code between here ... - __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero); - __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero); - __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero); - __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero); - __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero); - __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero); - __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero); - __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero); - tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0); - tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0); - tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0); - tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0); - tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0); - tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0); - tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0); - tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0); - // ... and here. - // PS: also change code in av1/encoder/av1_dct.c - tr2_0 = _mm256_add_epi16(tr2_0, kOne); - tr2_1 = _mm256_add_epi16(tr2_1, kOne); - tr2_2 = _mm256_add_epi16(tr2_2, kOne); - tr2_3 = _mm256_add_epi16(tr2_3, kOne); - tr2_4 = _mm256_add_epi16(tr2_4, kOne); - tr2_5 = _mm256_add_epi16(tr2_5, kOne); - tr2_6 = _mm256_add_epi16(tr2_6, kOne); - tr2_7 = _mm256_add_epi16(tr2_7, kOne); - tr2_0 = _mm256_srai_epi16(tr2_0, 2); - tr2_1 = _mm256_srai_epi16(tr2_1, 2); - tr2_2 = _mm256_srai_epi16(tr2_2, 2); - tr2_3 = _mm256_srai_epi16(tr2_3, 2); - tr2_4 = _mm256_srai_epi16(tr2_4, 2); - tr2_5 = _mm256_srai_epi16(tr2_5, 2); - tr2_6 = _mm256_srai_epi16(tr2_6, 2); - tr2_7 = _mm256_srai_epi16(tr2_7, 2); - } - if (0 == pass) { - // Note: even though all these stores are aligned, using the aligned - // intrinsic make the code slightly slower. - _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), - _mm256_castsi256_si128(tr2_0)); - _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), - _mm256_castsi256_si128(tr2_1)); - _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), - _mm256_castsi256_si128(tr2_2)); - _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), - _mm256_castsi256_si128(tr2_3)); - _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), - _mm256_castsi256_si128(tr2_4)); - _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), - _mm256_castsi256_si128(tr2_5)); - _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), - _mm256_castsi256_si128(tr2_6)); - _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), - _mm256_castsi256_si128(tr2_7)); - - _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), - _mm256_extractf128_si256(tr2_0, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), - _mm256_extractf128_si256(tr2_1, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), - _mm256_extractf128_si256(tr2_2, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), - _mm256_extractf128_si256(tr2_3, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), - _mm256_extractf128_si256(tr2_4, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), - _mm256_extractf128_si256(tr2_5, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), - _mm256_extractf128_si256(tr2_6, 1)); - _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), - _mm256_extractf128_si256(tr2_7, 1)); - // Process next 8x8 - output_currStep += 8; - output_nextStep += 8; - } - if (1 == pass) { - store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32); - store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32); - store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32); - store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32); - store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32); - store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32); - store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32); - store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32); - curr_out += 8; - next_out += 8; - } - } - } - } - } - _mm256_zeroupper(); -} // NOLINT diff --git a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h deleted file mode 100644 index 69dd6af11..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_dct32x32_impl_sse2.h +++ /dev/null @@ -1,3201 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include // SSE2 - -#include "aom_dsp/fwd_txfm.h" -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -// TODO(jingning) The high bit-depth version needs re-work for performance. -// The current SSE2 implementation also causes cross reference to the static -// functions in the C implementation file. -#if DCT_HIGH_BIT_DEPTH -#define ADD_EPI16 _mm_adds_epi16 -#define SUB_EPI16 _mm_subs_epi16 -#if FDCT32x32_HIGH_PRECISION -void aom_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; - aom_fdct32(temp_in, temp_out, 0); - for (j = 0; j < 32; ++j) - out[j + i * 32] = - (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); - } -} -#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_c -#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rows_c -#else -void aom_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) { - int i, j; - for (i = 0; i < 32; ++i) { - tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i]; - aom_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; - } -} -#define HIGH_FDCT32x32_2D_C aom_highbd_fdct32x32_rd_c -#define HIGH_FDCT32x32_2D_ROWS_C aom_fdct32x32_rd_rows_c -#endif // FDCT32x32_HIGH_PRECISION -#else -#define ADD_EPI16 _mm_add_epi16 -#define SUB_EPI16 _mm_sub_epi16 -#endif // DCT_HIGH_BIT_DEPTH - -void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { - // Calculate pre-multiplied strides - const int str1 = stride; - const int str2 = 2 * stride; - const int str3 = 2 * stride + str1; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); - const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); - const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); - const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); - const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); - const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); - const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); - const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); - const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); - const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); - const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); - const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); - const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); - const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); - const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); - const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); - const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - int pass; -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 32; column_start += 8) { - __m128i step1[32]; - __m128i step2[32]; - __m128i step3[32]; - __m128i out[32]; - // Stage 1 - // Note: even though all the loads below are aligned, using the aligned - // intrinsic make the code slightly slower. - if (0 == pass) { - const int16_t *in = &input[column_start]; - // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. - { - const int16_t *ina = in + 0 * str1; - const int16_t *inb = in + 31 * str1; - __m128i *step1a = &step1[0]; - __m128i *step1b = &step1[31]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 4 * str1; - const int16_t *inb = in + 27 * str1; - __m128i *step1a = &step1[4]; - __m128i *step1b = &step1[27]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 8 * str1; - const int16_t *inb = in + 23 * str1; - __m128i *step1a = &step1[8]; - __m128i *step1b = &step1[23]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - { - const int16_t *ina = in + 12 * str1; - const int16_t *inb = in + 19 * str1; - __m128i *step1a = &step1[12]; - __m128i *step1b = &step1[19]; - const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); - const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); - const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); - const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); - const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); - const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); - const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); - const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); - step1a[0] = _mm_add_epi16(ina0, inb0); - step1a[1] = _mm_add_epi16(ina1, inb1); - step1a[2] = _mm_add_epi16(ina2, inb2); - step1a[3] = _mm_add_epi16(ina3, inb3); - step1b[-3] = _mm_sub_epi16(ina3, inb3); - step1b[-2] = _mm_sub_epi16(ina2, inb2); - step1b[-1] = _mm_sub_epi16(ina1, inb1); - step1b[-0] = _mm_sub_epi16(ina0, inb0); - step1a[0] = _mm_slli_epi16(step1a[0], 2); - step1a[1] = _mm_slli_epi16(step1a[1], 2); - step1a[2] = _mm_slli_epi16(step1a[2], 2); - step1a[3] = _mm_slli_epi16(step1a[3], 2); - step1b[-3] = _mm_slli_epi16(step1b[-3], 2); - step1b[-2] = _mm_slli_epi16(step1b[-2], 2); - step1b[-1] = _mm_slli_epi16(step1b[-1], 2); - step1b[-0] = _mm_slli_epi16(step1b[-0], 2); - } - } else { - int16_t *in = &intermediate[column_start]; - // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; - // Note: using the same approach as above to have common offset is - // counter-productive as all offsets can be calculated at compile - // time. - // Note: the next four blocks could be in a loop. That would help the - // instruction cache but is actually slower. - { - __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); - __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); - __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); - __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); - __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); - __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); - __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); - __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); - step1[0] = ADD_EPI16(in00, in31); - step1[1] = ADD_EPI16(in01, in30); - step1[2] = ADD_EPI16(in02, in29); - step1[3] = ADD_EPI16(in03, in28); - step1[28] = SUB_EPI16(in03, in28); - step1[29] = SUB_EPI16(in02, in29); - step1[30] = SUB_EPI16(in01, in30); - step1[31] = SUB_EPI16(in00, in31); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2], - &step1[3], &step1[28], &step1[29], - &step1[30], &step1[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); - __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); - __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); - __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32)); - __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); - __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); - __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); - __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); - step1[4] = ADD_EPI16(in04, in27); - step1[5] = ADD_EPI16(in05, in26); - step1[6] = ADD_EPI16(in06, in25); - step1[7] = ADD_EPI16(in07, in24); - step1[24] = SUB_EPI16(in07, in24); - step1[25] = SUB_EPI16(in06, in25); - step1[26] = SUB_EPI16(in05, in26); - step1[27] = SUB_EPI16(in04, in27); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6], - &step1[7], &step1[24], &step1[25], - &step1[26], &step1[27]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); - __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); - __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); - __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); - __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); - __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); - __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); - __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); - step1[8] = ADD_EPI16(in08, in23); - step1[9] = ADD_EPI16(in09, in22); - step1[10] = ADD_EPI16(in10, in21); - step1[11] = ADD_EPI16(in11, in20); - step1[20] = SUB_EPI16(in11, in20); - step1[21] = SUB_EPI16(in10, in21); - step1[22] = SUB_EPI16(in09, in22); - step1[23] = SUB_EPI16(in08, in23); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10], - &step1[11], &step1[20], &step1[21], - &step1[22], &step1[23]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); - __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); - __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); - __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32)); - __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); - __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); - __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); - __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); - step1[12] = ADD_EPI16(in12, in19); - step1[13] = ADD_EPI16(in13, in18); - step1[14] = ADD_EPI16(in14, in17); - step1[15] = ADD_EPI16(in15, in16); - step1[16] = SUB_EPI16(in15, in16); - step1[17] = SUB_EPI16(in14, in17); - step1[18] = SUB_EPI16(in13, in18); - step1[19] = SUB_EPI16(in12, in19); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14], - &step1[15], &step1[16], &step1[17], - &step1[18], &step1[19]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Stage 2 - { - step2[0] = ADD_EPI16(step1[0], step1[15]); - step2[1] = ADD_EPI16(step1[1], step1[14]); - step2[2] = ADD_EPI16(step1[2], step1[13]); - step2[3] = ADD_EPI16(step1[3], step1[12]); - step2[4] = ADD_EPI16(step1[4], step1[11]); - step2[5] = ADD_EPI16(step1[5], step1[10]); - step2[6] = ADD_EPI16(step1[6], step1[9]); - step2[7] = ADD_EPI16(step1[7], step1[8]); - step2[8] = SUB_EPI16(step1[7], step1[8]); - step2[9] = SUB_EPI16(step1[6], step1[9]); - step2[10] = SUB_EPI16(step1[5], step1[10]); - step2[11] = SUB_EPI16(step1[4], step1[11]); - step2[12] = SUB_EPI16(step1[3], step1[12]); - step2[13] = SUB_EPI16(step1[2], step1[13]); - step2[14] = SUB_EPI16(step1[1], step1[14]); - step2[15] = SUB_EPI16(step1[0], step1[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], - &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); - const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); - const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); - const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); - const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]); - const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); - const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); - const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); - const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); - const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); - const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); - const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); - const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); - const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); - const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); - const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); - const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); - const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); - const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); - const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); - const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); - const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); - const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); - const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); - const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); - const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); - const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); - const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); - const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); - const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING); - const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); - const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); - const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); - const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); - const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); - const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); - const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); - const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); - const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); - const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); - const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); - const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); - const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); - const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); - const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); - const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); - const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); - const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); - const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); - const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); - const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); - const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); - const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); - const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); - const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); - // Combine - step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); - step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7); - step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); - step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); - step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); - step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); - step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); - step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22], - &step2[23], &step2[24], &step2[25], - &step2[26], &step2[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - -#if !FDCT32x32_HIGH_PRECISION - // dump the magnitude by half, hence the intermediate values are within - // the range of 16 bits. - if (1 == pass) { - __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero); - __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero); - __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero); - __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero); - __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero); - __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero); - __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero); - __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero); - __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero); - __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero); - __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); - __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); - __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); - __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); - __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); - __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); - __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); - __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); - __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); - __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); - __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); - __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); - __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); - __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero); - __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); - __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); - __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); - __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); - __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); - __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); - __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); - __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); - - step2[0] = SUB_EPI16(step2[0], s3_00_0); - step2[1] = SUB_EPI16(step2[1], s3_01_0); - step2[2] = SUB_EPI16(step2[2], s3_02_0); - step2[3] = SUB_EPI16(step2[3], s3_03_0); - step2[4] = SUB_EPI16(step2[4], s3_04_0); - step2[5] = SUB_EPI16(step2[5], s3_05_0); - step2[6] = SUB_EPI16(step2[6], s3_06_0); - step2[7] = SUB_EPI16(step2[7], s3_07_0); - step2[8] = SUB_EPI16(step2[8], s2_08_0); - step2[9] = SUB_EPI16(step2[9], s2_09_0); - step2[10] = SUB_EPI16(step2[10], s3_10_0); - step2[11] = SUB_EPI16(step2[11], s3_11_0); - step2[12] = SUB_EPI16(step2[12], s3_12_0); - step2[13] = SUB_EPI16(step2[13], s3_13_0); - step2[14] = SUB_EPI16(step2[14], s2_14_0); - step2[15] = SUB_EPI16(step2[15], s2_15_0); - step1[16] = SUB_EPI16(step1[16], s3_16_0); - step1[17] = SUB_EPI16(step1[17], s3_17_0); - step1[18] = SUB_EPI16(step1[18], s3_18_0); - step1[19] = SUB_EPI16(step1[19], s3_19_0); - step2[20] = SUB_EPI16(step2[20], s3_20_0); - step2[21] = SUB_EPI16(step2[21], s3_21_0); - step2[22] = SUB_EPI16(step2[22], s3_22_0); - step2[23] = SUB_EPI16(step2[23], s3_23_0); - step2[24] = SUB_EPI16(step2[24], s3_24_0); - step2[25] = SUB_EPI16(step2[25], s3_25_0); - step2[26] = SUB_EPI16(step2[26], s3_26_0); - step2[27] = SUB_EPI16(step2[27], s3_27_0); - step1[28] = SUB_EPI16(step1[28], s3_28_0); - step1[29] = SUB_EPI16(step1[29], s3_29_0); - step1[30] = SUB_EPI16(step1[30], s3_30_0); - step1[31] = SUB_EPI16(step1[31], s3_31_0); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x32( - &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5], - &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11], - &step2[12], &step2[13], &step2[14], &step2[15], &step1[16], - &step1[17], &step1[18], &step1[19], &step2[20], &step2[21], - &step2[22], &step2[23], &step2[24], &step2[25], &step2[26], - &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - step2[0] = _mm_add_epi16(step2[0], kOne); - step2[1] = _mm_add_epi16(step2[1], kOne); - step2[2] = _mm_add_epi16(step2[2], kOne); - step2[3] = _mm_add_epi16(step2[3], kOne); - step2[4] = _mm_add_epi16(step2[4], kOne); - step2[5] = _mm_add_epi16(step2[5], kOne); - step2[6] = _mm_add_epi16(step2[6], kOne); - step2[7] = _mm_add_epi16(step2[7], kOne); - step2[8] = _mm_add_epi16(step2[8], kOne); - step2[9] = _mm_add_epi16(step2[9], kOne); - step2[10] = _mm_add_epi16(step2[10], kOne); - step2[11] = _mm_add_epi16(step2[11], kOne); - step2[12] = _mm_add_epi16(step2[12], kOne); - step2[13] = _mm_add_epi16(step2[13], kOne); - step2[14] = _mm_add_epi16(step2[14], kOne); - step2[15] = _mm_add_epi16(step2[15], kOne); - step1[16] = _mm_add_epi16(step1[16], kOne); - step1[17] = _mm_add_epi16(step1[17], kOne); - step1[18] = _mm_add_epi16(step1[18], kOne); - step1[19] = _mm_add_epi16(step1[19], kOne); - step2[20] = _mm_add_epi16(step2[20], kOne); - step2[21] = _mm_add_epi16(step2[21], kOne); - step2[22] = _mm_add_epi16(step2[22], kOne); - step2[23] = _mm_add_epi16(step2[23], kOne); - step2[24] = _mm_add_epi16(step2[24], kOne); - step2[25] = _mm_add_epi16(step2[25], kOne); - step2[26] = _mm_add_epi16(step2[26], kOne); - step2[27] = _mm_add_epi16(step2[27], kOne); - step1[28] = _mm_add_epi16(step1[28], kOne); - step1[29] = _mm_add_epi16(step1[29], kOne); - step1[30] = _mm_add_epi16(step1[30], kOne); - step1[31] = _mm_add_epi16(step1[31], kOne); - - step2[0] = _mm_srai_epi16(step2[0], 2); - step2[1] = _mm_srai_epi16(step2[1], 2); - step2[2] = _mm_srai_epi16(step2[2], 2); - step2[3] = _mm_srai_epi16(step2[3], 2); - step2[4] = _mm_srai_epi16(step2[4], 2); - step2[5] = _mm_srai_epi16(step2[5], 2); - step2[6] = _mm_srai_epi16(step2[6], 2); - step2[7] = _mm_srai_epi16(step2[7], 2); - step2[8] = _mm_srai_epi16(step2[8], 2); - step2[9] = _mm_srai_epi16(step2[9], 2); - step2[10] = _mm_srai_epi16(step2[10], 2); - step2[11] = _mm_srai_epi16(step2[11], 2); - step2[12] = _mm_srai_epi16(step2[12], 2); - step2[13] = _mm_srai_epi16(step2[13], 2); - step2[14] = _mm_srai_epi16(step2[14], 2); - step2[15] = _mm_srai_epi16(step2[15], 2); - step1[16] = _mm_srai_epi16(step1[16], 2); - step1[17] = _mm_srai_epi16(step1[17], 2); - step1[18] = _mm_srai_epi16(step1[18], 2); - step1[19] = _mm_srai_epi16(step1[19], 2); - step2[20] = _mm_srai_epi16(step2[20], 2); - step2[21] = _mm_srai_epi16(step2[21], 2); - step2[22] = _mm_srai_epi16(step2[22], 2); - step2[23] = _mm_srai_epi16(step2[23], 2); - step2[24] = _mm_srai_epi16(step2[24], 2); - step2[25] = _mm_srai_epi16(step2[25], 2); - step2[26] = _mm_srai_epi16(step2[26], 2); - step2[27] = _mm_srai_epi16(step2[27], 2); - step1[28] = _mm_srai_epi16(step1[28], 2); - step1[29] = _mm_srai_epi16(step1[29], 2); - step1[30] = _mm_srai_epi16(step1[30], 2); - step1[31] = _mm_srai_epi16(step1[31], 2); - } -#endif // !FDCT32x32_HIGH_PRECISION - -#if FDCT32x32_HIGH_PRECISION - if (pass == 0) { -#endif - // Stage 3 - { - step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]); - step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]); - step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]); - step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]); - step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]); - step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]); - step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]); - step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2], - &step3[3], &step3[4], &step3[5], - &step3[6], &step3[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - // Combine - step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); - step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); - step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); - step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12], - &step3[13]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[16] = ADD_EPI16(step2[23], step1[16]); - step3[17] = ADD_EPI16(step2[22], step1[17]); - step3[18] = ADD_EPI16(step2[21], step1[18]); - step3[19] = ADD_EPI16(step2[20], step1[19]); - step3[20] = SUB_EPI16(step1[19], step2[20]); - step3[21] = SUB_EPI16(step1[18], step2[21]); - step3[22] = SUB_EPI16(step1[17], step2[22]); - step3[23] = SUB_EPI16(step1[16], step2[23]); - step3[24] = SUB_EPI16(step1[31], step2[24]); - step3[25] = SUB_EPI16(step1[30], step2[25]); - step3[26] = SUB_EPI16(step1[29], step2[26]); - step3[27] = SUB_EPI16(step1[28], step2[27]); - step3[28] = ADD_EPI16(step2[27], step1[28]); - step3[29] = ADD_EPI16(step2[26], step1[29]); - step3[30] = ADD_EPI16(step2[25], step1[30]); - step3[31] = ADD_EPI16(step2[24], step1[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step3[16], &step3[17], &step3[18], &step3[19], &step3[20], - &step3[21], &step3[22], &step3[23], &step3[24], &step3[25], - &step3[26], &step3[27], &step3[28], &step3[29], &step3[30], - &step3[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - - // Stage 4 - { - step1[0] = ADD_EPI16(step3[3], step3[0]); - step1[1] = ADD_EPI16(step3[2], step3[1]); - step1[2] = SUB_EPI16(step3[1], step3[2]); - step1[3] = SUB_EPI16(step3[0], step3[3]); - step1[8] = ADD_EPI16(step3[11], step2[8]); - step1[9] = ADD_EPI16(step3[10], step2[9]); - step1[10] = SUB_EPI16(step2[9], step3[10]); - step1[11] = SUB_EPI16(step2[8], step3[11]); - step1[12] = SUB_EPI16(step2[15], step3[12]); - step1[13] = SUB_EPI16(step2[14], step3[13]); - step1[14] = ADD_EPI16(step3[13], step2[14]); - step1[15] = ADD_EPI16(step3[12], step2[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5], - &step1[6], &step1[7], &step1[8], &step1[9], &step1[10], - &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); - const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); - const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); - const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); - const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); - const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); - const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); - const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); - const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); - const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); - const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); - const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); - const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); - // Combine - step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); - step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&step1[5], &step1[6]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); - const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); - const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); - const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); - const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); - const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); - const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); - const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); - const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); - const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); - const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); - const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); - const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); - const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); - const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); - const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); - const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); - const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); - const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); - const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); - const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); - const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); - const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); - const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); - const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); - const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); - const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); - const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); - const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); - const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); - const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); - const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); - const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); - const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); - const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); - const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); - const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); - const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); - const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); - const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); - const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); - const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); - const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); - const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); - const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); - const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); - const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); - const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); - const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); - const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); - const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); - const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); - const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); - const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); - const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); - // Combine - step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); - step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); - step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); - step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); - step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); - step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); - step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); - step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20], - &step1[21], &step1[26], &step1[27], - &step1[28], &step1[29]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 5 - { - step2[4] = ADD_EPI16(step1[5], step3[4]); - step2[5] = SUB_EPI16(step3[4], step1[5]); - step2[6] = SUB_EPI16(step3[7], step1[6]); - step2[7] = ADD_EPI16(step1[6], step3[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6], - &step2[7]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); - const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); - const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); - const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); - const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); - const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); - const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); - const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); - const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); - const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); - const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); - const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i out_00_4 = - _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); - const __m128i out_00_5 = - _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); - const __m128i out_16_4 = - _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); - const __m128i out_16_5 = - _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); - const __m128i out_08_4 = - _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); - const __m128i out_08_5 = - _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); - const __m128i out_24_4 = - _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); - const __m128i out_24_5 = - _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); - const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); - const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); - const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); - const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); - const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); - const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); - const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); - const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); - // Combine - out[0] = _mm_packs_epi32(out_00_6, out_00_7); - out[16] = _mm_packs_epi32(out_16_6, out_16_7); - out[8] = _mm_packs_epi32(out_08_6, out_08_7); - out[24] = _mm_packs_epi32(out_24_6, out_24_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]); - const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]); - const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); - const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); - const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); - const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); - const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); - const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); - const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); - const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); - const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); - const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); - // dct_const_round_shift - const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); - const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); - const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); - const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); - const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); - const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); - const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); - const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); - const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); - const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); - const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); - const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); - const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); - const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); - const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); - const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); - // Combine - step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7); - step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); - step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); - step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13], - &step2[14]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step2[16] = ADD_EPI16(step1[19], step3[16]); - step2[17] = ADD_EPI16(step1[18], step3[17]); - step2[18] = SUB_EPI16(step3[17], step1[18]); - step2[19] = SUB_EPI16(step3[16], step1[19]); - step2[20] = SUB_EPI16(step3[23], step1[20]); - step2[21] = SUB_EPI16(step3[22], step1[21]); - step2[22] = ADD_EPI16(step1[21], step3[22]); - step2[23] = ADD_EPI16(step1[20], step3[23]); - step2[24] = ADD_EPI16(step1[27], step3[24]); - step2[25] = ADD_EPI16(step1[26], step3[25]); - step2[26] = SUB_EPI16(step3[25], step1[26]); - step2[27] = SUB_EPI16(step3[24], step1[27]); - step2[28] = SUB_EPI16(step3[31], step1[28]); - step2[29] = SUB_EPI16(step3[30], step1[29]); - step2[30] = ADD_EPI16(step1[29], step3[30]); - step2[31] = ADD_EPI16(step1[28], step3[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step2[16], &step2[17], &step2[18], &step2[19], &step2[20], - &step2[21], &step2[22], &step2[23], &step2[24], &step2[25], - &step2[26], &step2[27], &step2[28], &step2[29], &step2[30], - &step2[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 6 - { - const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); - const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); - const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); - const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); - const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); - const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); - const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); - const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); - const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); - const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); - const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); - const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); - // dct_const_round_shift - const __m128i out_04_4 = - _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); - const __m128i out_04_5 = - _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); - const __m128i out_20_4 = - _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); - const __m128i out_20_5 = - _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); - const __m128i out_12_4 = - _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); - const __m128i out_12_5 = - _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); - const __m128i out_28_4 = - _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); - const __m128i out_28_5 = - _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); - const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); - const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); - const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); - const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); - const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); - const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); - const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); - const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); - // Combine - out[4] = _mm_packs_epi32(out_04_6, out_04_7); - out[20] = _mm_packs_epi32(out_20_6, out_20_7); - out[12] = _mm_packs_epi32(out_12_6, out_12_7); - out[28] = _mm_packs_epi32(out_28_6, out_28_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step3[8] = ADD_EPI16(step2[9], step1[8]); - step3[9] = SUB_EPI16(step1[8], step2[9]); - step3[10] = SUB_EPI16(step1[11], step2[10]); - step3[11] = ADD_EPI16(step2[10], step1[11]); - step3[12] = ADD_EPI16(step2[13], step1[12]); - step3[13] = SUB_EPI16(step1[12], step2[13]); - step3[14] = SUB_EPI16(step1[15], step2[14]); - step3[15] = ADD_EPI16(step2[14], step1[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10], - &step3[11], &step3[12], &step3[13], - &step3[14], &step3[15]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); - const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); - const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); - const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); - const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); - const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); - const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); - const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); - const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); - const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); - const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); - const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); - const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); - const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); - const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); - const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); - const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); - const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); - const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); - const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); - const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); - const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); - const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); - const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); - // dct_const_round_shift - const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); - const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); - const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); - const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); - const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); - const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); - const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); - const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); - const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); - const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); - const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); - const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); - const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); - const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); - const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); - const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); - const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); - const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); - const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); - const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); - const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); - const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); - const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); - const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); - const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); - const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); - const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); - const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); - const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); - const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); - const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); - const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); - // Combine - step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); - step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); - step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); - step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); - // Combine - step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); - step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); - step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); - step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21], - &step3[22], &step3[25], &step3[26], - &step3[29], &step3[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Stage 7 - { - const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]); - const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]); - const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]); - const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]); - const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); - const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); - const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); - const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); - const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); - const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); - const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); - const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); - const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); - const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); - const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); - const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); - const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); - const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); - const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); - const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); - const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); - const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); - const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); - const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); - // dct_const_round_shift - const __m128i out_02_4 = - _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); - const __m128i out_02_5 = - _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); - const __m128i out_18_4 = - _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); - const __m128i out_18_5 = - _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); - const __m128i out_10_4 = - _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); - const __m128i out_10_5 = - _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); - const __m128i out_26_4 = - _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); - const __m128i out_26_5 = - _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); - const __m128i out_06_4 = - _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); - const __m128i out_06_5 = - _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); - const __m128i out_22_4 = - _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); - const __m128i out_22_5 = - _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); - const __m128i out_14_4 = - _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); - const __m128i out_14_5 = - _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); - const __m128i out_30_4 = - _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); - const __m128i out_30_5 = - _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); - const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); - const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); - const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); - const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); - const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); - const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); - const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); - const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); - const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); - const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); - const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); - const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); - const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); - const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); - const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); - const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); - // Combine - out[2] = _mm_packs_epi32(out_02_6, out_02_7); - out[18] = _mm_packs_epi32(out_18_6, out_18_7); - out[10] = _mm_packs_epi32(out_10_6, out_10_7); - out[26] = _mm_packs_epi32(out_26_6, out_26_7); - out[6] = _mm_packs_epi32(out_06_6, out_06_7); - out[22] = _mm_packs_epi32(out_22_6, out_22_7); - out[14] = _mm_packs_epi32(out_14_6, out_14_7); - out[30] = _mm_packs_epi32(out_30_6, out_30_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], - &out[6], &out[22], &out[14], &out[30]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - step1[16] = ADD_EPI16(step3[17], step2[16]); - step1[17] = SUB_EPI16(step2[16], step3[17]); - step1[18] = SUB_EPI16(step2[19], step3[18]); - step1[19] = ADD_EPI16(step3[18], step2[19]); - step1[20] = ADD_EPI16(step3[21], step2[20]); - step1[21] = SUB_EPI16(step2[20], step3[21]); - step1[22] = SUB_EPI16(step2[23], step3[22]); - step1[23] = ADD_EPI16(step3[22], step2[23]); - step1[24] = ADD_EPI16(step3[25], step2[24]); - step1[25] = SUB_EPI16(step2[24], step3[25]); - step1[26] = SUB_EPI16(step2[27], step3[26]); - step1[27] = ADD_EPI16(step3[26], step2[27]); - step1[28] = ADD_EPI16(step3[29], step2[28]); - step1[29] = SUB_EPI16(step2[28], step3[29]); - step1[30] = SUB_EPI16(step2[31], step3[30]); - step1[31] = ADD_EPI16(step3[30], step2[31]); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x16( - &step1[16], &step1[17], &step1[18], &step1[19], &step1[20], - &step1[21], &step1[22], &step1[23], &step1[24], &step1[25], - &step1[26], &step1[27], &step1[28], &step1[29], &step1[30], - &step1[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Final stage --- outputs indices are bit-reversed. - { - const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); - const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); - const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); - const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); - const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); - const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); - const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); - const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); - const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); - const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); - const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); - const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); - const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); - const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); - const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); - const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); - const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); - const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); - const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); - const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); - const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); - const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); - const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); - const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); - // dct_const_round_shift - const __m128i out_01_4 = - _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); - const __m128i out_01_5 = - _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); - const __m128i out_17_4 = - _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); - const __m128i out_17_5 = - _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); - const __m128i out_09_4 = - _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); - const __m128i out_09_5 = - _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); - const __m128i out_25_4 = - _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); - const __m128i out_25_5 = - _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); - const __m128i out_07_4 = - _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); - const __m128i out_07_5 = - _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); - const __m128i out_23_4 = - _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); - const __m128i out_23_5 = - _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); - const __m128i out_15_4 = - _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); - const __m128i out_15_5 = - _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); - const __m128i out_31_4 = - _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); - const __m128i out_31_5 = - _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); - const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); - const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); - const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); - const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); - const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); - const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); - const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); - const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); - const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); - const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); - const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); - const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); - const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); - const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); - const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); - const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); - // Combine - out[1] = _mm_packs_epi32(out_01_6, out_01_7); - out[17] = _mm_packs_epi32(out_17_6, out_17_7); - out[9] = _mm_packs_epi32(out_09_6, out_09_7); - out[25] = _mm_packs_epi32(out_25_6, out_25_7); - out[7] = _mm_packs_epi32(out_07_6, out_07_7); - out[23] = _mm_packs_epi32(out_23_6, out_23_7); - out[15] = _mm_packs_epi32(out_15_6, out_15_7); - out[31] = _mm_packs_epi32(out_31_6, out_31_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], - &out[7], &out[23], &out[15], &out[31]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); - const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); - const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); - const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); - const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); - const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); - const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); - const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); - const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); - const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); - const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); - const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); - const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); - const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); - const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); - const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); - const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); - const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); - const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); - const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); - const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); - const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); - const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); - const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); - // dct_const_round_shift - const __m128i out_05_4 = - _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); - const __m128i out_05_5 = - _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); - const __m128i out_21_4 = - _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); - const __m128i out_21_5 = - _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); - const __m128i out_13_4 = - _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); - const __m128i out_13_5 = - _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); - const __m128i out_29_4 = - _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); - const __m128i out_29_5 = - _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); - const __m128i out_03_4 = - _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); - const __m128i out_03_5 = - _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); - const __m128i out_19_4 = - _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); - const __m128i out_19_5 = - _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); - const __m128i out_11_4 = - _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); - const __m128i out_11_5 = - _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); - const __m128i out_27_4 = - _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); - const __m128i out_27_5 = - _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); - const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); - const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); - const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); - const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); - const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); - const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); - const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); - const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); - const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); - const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); - const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); - const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); - const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); - const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); - const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); - const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); - // Combine - out[5] = _mm_packs_epi32(out_05_6, out_05_7); - out[21] = _mm_packs_epi32(out_21_6, out_21_7); - out[13] = _mm_packs_epi32(out_13_6, out_13_7); - out[29] = _mm_packs_epi32(out_29_6, out_29_7); - out[3] = _mm_packs_epi32(out_03_6, out_03_7); - out[19] = _mm_packs_epi32(out_19_6, out_19_7); - out[11] = _mm_packs_epi32(out_11_6, out_11_7); - out[27] = _mm_packs_epi32(out_27_6, out_27_7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], - &out[3], &out[19], &out[11], &out[27]); - if (overflow) { - if (pass == 0) - HIGH_FDCT32x32_2D_C(input, output_org, stride); - else - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } -#if FDCT32x32_HIGH_PRECISION - } else { - __m128i lstep1[64], lstep2[64], lstep3[64]; - __m128i u[32], v[32], sign[16]; - const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); - // start using 32-bit operations - // stage 3 - { - // expanding to 32-bit length priori to addition operations - lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero); - lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero); - lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero); - lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero); - lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero); - lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero); - lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero); - lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero); - lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero); - lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero); - lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero); - lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero); - lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero); - lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero); - lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero); - lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero); - lstep2[0] = _mm_madd_epi16(lstep2[0], kOne); - lstep2[1] = _mm_madd_epi16(lstep2[1], kOne); - lstep2[2] = _mm_madd_epi16(lstep2[2], kOne); - lstep2[3] = _mm_madd_epi16(lstep2[3], kOne); - lstep2[4] = _mm_madd_epi16(lstep2[4], kOne); - lstep2[5] = _mm_madd_epi16(lstep2[5], kOne); - lstep2[6] = _mm_madd_epi16(lstep2[6], kOne); - lstep2[7] = _mm_madd_epi16(lstep2[7], kOne); - lstep2[8] = _mm_madd_epi16(lstep2[8], kOne); - lstep2[9] = _mm_madd_epi16(lstep2[9], kOne); - lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); - lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); - lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); - lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); - lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); - lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); - - lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]); - lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]); - lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]); - lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]); - lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]); - lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]); - lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]); - lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]); - lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]); - lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]); - lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]); - lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]); - lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]); - lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]); - lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]); - lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]); - } - { - const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); - const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); - const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); - const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); - const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); - const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); - const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); - const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); - const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); - const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); - const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); - const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); - const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); - const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); - const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); - const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); - const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); - const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); - const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); - lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); - lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); - lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); - lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); - lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); - lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); - lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); - lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); - } - { - lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); - lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); - lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); - lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); - lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); - lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); - lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); - lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); - lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); - lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); - lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); - lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); - lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); - lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); - lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); - lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); - lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); - lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); - lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); - lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); - lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); - lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); - lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); - lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); - lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); - lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); - lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); - lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); - lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); - lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); - lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); - lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); - - lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); - lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); - lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); - lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); - lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); - lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); - lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); - lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); - lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); - lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); - lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); - lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); - lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); - lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); - lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); - lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); - lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); - lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); - lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); - lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); - lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); - lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); - lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); - lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); - lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); - lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); - lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); - lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); - lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); - lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); - lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); - lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); - - lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); - lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); - - lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); - lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); - lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); - lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); - lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); - lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); - lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); - lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); - lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); - lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); - lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); - lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); - lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); - lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); - lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); - lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); - lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); - lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); - lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); - lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); - lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); - lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); - lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); - lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); - lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); - lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); - lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); - lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); - lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); - lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); - } - - // stage 4 - { - // expanding to 32-bit length priori to addition operations - lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero); - lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero); - lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero); - lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero); - lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); - lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); - lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); - lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); - lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); - lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); - lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); - lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); - lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); - lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); - lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); - lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); - - lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]); - lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]); - lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]); - lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]); - lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]); - lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]); - lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]); - lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]); - lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); - lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); - lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); - lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); - lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); - lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); - lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); - lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); - lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); - lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); - lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); - lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); - lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); - lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); - lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); - lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); - } - { - // to be continued... - // - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - - u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); - u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); - u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); - u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[0] = k_madd_epi32(u[0], k32_p16_m16); - v[1] = k_madd_epi32(u[1], k32_p16_m16); - v[2] = k_madd_epi32(u[2], k32_p16_m16); - v[3] = k_madd_epi32(u[3], k32_p16_m16); - v[4] = k_madd_epi32(u[0], k32_p16_p16); - v[5] = k_madd_epi32(u[1], k32_p16_p16); - v[6] = k_madd_epi32(u[2], k32_p16_p16); - v[7] = k_madd_epi32(u[3], k32_p16_p16); -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4], - &v[5], &v[6], &v[7], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - - lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - } - { - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); - u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); - u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); - u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); - u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); - u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); - u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); - u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); - u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); - u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); - u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); - u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); - u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); - u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); - u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); - u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); - - v[0] = k_madd_epi32(u[0], k32_m08_p24); - v[1] = k_madd_epi32(u[1], k32_m08_p24); - v[2] = k_madd_epi32(u[2], k32_m08_p24); - v[3] = k_madd_epi32(u[3], k32_m08_p24); - v[4] = k_madd_epi32(u[4], k32_m08_p24); - v[5] = k_madd_epi32(u[5], k32_m08_p24); - v[6] = k_madd_epi32(u[6], k32_m08_p24); - v[7] = k_madd_epi32(u[7], k32_m08_p24); - v[8] = k_madd_epi32(u[8], k32_m24_m08); - v[9] = k_madd_epi32(u[9], k32_m24_m08); - v[10] = k_madd_epi32(u[10], k32_m24_m08); - v[11] = k_madd_epi32(u[11], k32_m24_m08); - v[12] = k_madd_epi32(u[12], k32_m24_m08); - v[13] = k_madd_epi32(u[13], k32_m24_m08); - v[14] = k_madd_epi32(u[14], k32_m24_m08); - v[15] = k_madd_epi32(u[15], k32_m24_m08); - v[16] = k_madd_epi32(u[12], k32_m08_p24); - v[17] = k_madd_epi32(u[13], k32_m08_p24); - v[18] = k_madd_epi32(u[14], k32_m08_p24); - v[19] = k_madd_epi32(u[15], k32_m08_p24); - v[20] = k_madd_epi32(u[8], k32_m08_p24); - v[21] = k_madd_epi32(u[9], k32_m08_p24); - v[22] = k_madd_epi32(u[10], k32_m08_p24); - v[23] = k_madd_epi32(u[11], k32_m08_p24); - v[24] = k_madd_epi32(u[4], k32_p24_p08); - v[25] = k_madd_epi32(u[5], k32_p24_p08); - v[26] = k_madd_epi32(u[6], k32_p24_p08); - v[27] = k_madd_epi32(u[7], k32_p24_p08); - v[28] = k_madd_epi32(u[0], k32_p24_p08); - v[29] = k_madd_epi32(u[1], k32_p24_p08); - v[30] = k_madd_epi32(u[2], k32_p24_p08); - v[31] = k_madd_epi32(u[3], k32_p24_p08); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 5 - { - lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]); - lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]); - lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]); - lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]); - lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); - lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); - lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); - lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); - } - { - const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); - const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - - u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); - u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); - u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); - u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); - u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); - u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); - u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); - u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); - - // TODO(jingning): manually inline k_madd_epi32_ to further hide - // instruction latency. - v[0] = k_madd_epi32(u[0], k32_p16_p16); - v[1] = k_madd_epi32(u[1], k32_p16_p16); - v[2] = k_madd_epi32(u[2], k32_p16_p16); - v[3] = k_madd_epi32(u[3], k32_p16_p16); - v[4] = k_madd_epi32(u[0], k32_p16_m16); - v[5] = k_madd_epi32(u[1], k32_p16_m16); - v[6] = k_madd_epi32(u[2], k32_p16_m16); - v[7] = k_madd_epi32(u[3], k32_p16_m16); - v[8] = k_madd_epi32(u[4], k32_p24_p08); - v[9] = k_madd_epi32(u[5], k32_p24_p08); - v[10] = k_madd_epi32(u[6], k32_p24_p08); - v[11] = k_madd_epi32(u[7], k32_p24_p08); - v[12] = k_madd_epi32(u[4], k32_m08_p24); - v[13] = k_madd_epi32(u[5], k32_m08_p24); - v[14] = k_madd_epi32(u[6], k32_m08_p24); - v[15] = k_madd_epi32(u[7], k32_m08_p24); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm_cmplt_epi32(u[0], kZero); - sign[1] = _mm_cmplt_epi32(u[1], kZero); - sign[2] = _mm_cmplt_epi32(u[2], kZero); - sign[3] = _mm_cmplt_epi32(u[3], kZero); - sign[4] = _mm_cmplt_epi32(u[4], kZero); - sign[5] = _mm_cmplt_epi32(u[5], kZero); - sign[6] = _mm_cmplt_epi32(u[6], kZero); - sign[7] = _mm_cmplt_epi32(u[7], kZero); - - u[0] = _mm_sub_epi32(u[0], sign[0]); - u[1] = _mm_sub_epi32(u[1], sign[1]); - u[2] = _mm_sub_epi32(u[2], sign[2]); - u[3] = _mm_sub_epi32(u[3], sign[3]); - u[4] = _mm_sub_epi32(u[4], sign[4]); - u[5] = _mm_sub_epi32(u[5], sign[5]); - u[6] = _mm_sub_epi32(u[6], sign[6]); - u[7] = _mm_sub_epi32(u[7], sign[7]); - - u[0] = _mm_add_epi32(u[0], K32One); - u[1] = _mm_add_epi32(u[1], K32One); - u[2] = _mm_add_epi32(u[2], K32One); - u[3] = _mm_add_epi32(u[3], K32One); - u[4] = _mm_add_epi32(u[4], K32One); - u[5] = _mm_add_epi32(u[5], K32One); - u[6] = _mm_add_epi32(u[6], K32One); - u[7] = _mm_add_epi32(u[7], K32One); - - u[0] = _mm_srai_epi32(u[0], 2); - u[1] = _mm_srai_epi32(u[1], 2); - u[2] = _mm_srai_epi32(u[2], 2); - u[3] = _mm_srai_epi32(u[3], 2); - u[4] = _mm_srai_epi32(u[4], 2); - u[5] = _mm_srai_epi32(u[5], 2); - u[6] = _mm_srai_epi32(u[6], 2); - u[7] = _mm_srai_epi32(u[7], 2); - - // Combine - out[0] = _mm_packs_epi32(u[0], u[1]); - out[16] = _mm_packs_epi32(u[2], u[3]); - out[8] = _mm_packs_epi32(u[4], u[5]); - out[24] = _mm_packs_epi32(u[6], u[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); - const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); - const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); - - u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); - u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); - u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); - u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); - u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); - u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); - u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); - u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); - - v[0] = k_madd_epi32(u[0], k32_m08_p24); - v[1] = k_madd_epi32(u[1], k32_m08_p24); - v[2] = k_madd_epi32(u[2], k32_m08_p24); - v[3] = k_madd_epi32(u[3], k32_m08_p24); - v[4] = k_madd_epi32(u[4], k32_m24_m08); - v[5] = k_madd_epi32(u[5], k32_m24_m08); - v[6] = k_madd_epi32(u[6], k32_m24_m08); - v[7] = k_madd_epi32(u[7], k32_m24_m08); - v[8] = k_madd_epi32(u[4], k32_m08_p24); - v[9] = k_madd_epi32(u[5], k32_m08_p24); - v[10] = k_madd_epi32(u[6], k32_m08_p24); - v[11] = k_madd_epi32(u[7], k32_m08_p24); - v[12] = k_madd_epi32(u[0], k32_p24_p08); - v[13] = k_madd_epi32(u[1], k32_p24_p08); - v[14] = k_madd_epi32(u[2], k32_p24_p08); - v[15] = k_madd_epi32(u[3], k32_p24_p08); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - } - { - lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); - lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); - lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); - lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); - lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); - lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); - lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); - lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); - lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); - lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); - lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); - lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); - lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); - lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); - lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); - lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); - lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); - lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); - lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); - lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); - lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); - lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); - lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); - lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); - lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); - lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); - lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); - lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); - lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); - lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); - lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); - lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); - } - // stage 6 - { - const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); - const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - - u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); - u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); - u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); - u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); - u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); - u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); - u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); - u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); - u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); - u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); - u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); - u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]); - u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]); - u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]); - u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]); - - v[0] = k_madd_epi32(u[0], k32_p28_p04); - v[1] = k_madd_epi32(u[1], k32_p28_p04); - v[2] = k_madd_epi32(u[2], k32_p28_p04); - v[3] = k_madd_epi32(u[3], k32_p28_p04); - v[4] = k_madd_epi32(u[4], k32_p12_p20); - v[5] = k_madd_epi32(u[5], k32_p12_p20); - v[6] = k_madd_epi32(u[6], k32_p12_p20); - v[7] = k_madd_epi32(u[7], k32_p12_p20); - v[8] = k_madd_epi32(u[8], k32_m20_p12); - v[9] = k_madd_epi32(u[9], k32_m20_p12); - v[10] = k_madd_epi32(u[10], k32_m20_p12); - v[11] = k_madd_epi32(u[11], k32_m20_p12); - v[12] = k_madd_epi32(u[12], k32_m04_p28); - v[13] = k_madd_epi32(u[13], k32_m04_p28); - v[14] = k_madd_epi32(u[14], k32_m04_p28); - v[15] = k_madd_epi32(u[15], k32_m04_p28); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_16( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - sign[0] = _mm_cmplt_epi32(u[0], kZero); - sign[1] = _mm_cmplt_epi32(u[1], kZero); - sign[2] = _mm_cmplt_epi32(u[2], kZero); - sign[3] = _mm_cmplt_epi32(u[3], kZero); - sign[4] = _mm_cmplt_epi32(u[4], kZero); - sign[5] = _mm_cmplt_epi32(u[5], kZero); - sign[6] = _mm_cmplt_epi32(u[6], kZero); - sign[7] = _mm_cmplt_epi32(u[7], kZero); - - u[0] = _mm_sub_epi32(u[0], sign[0]); - u[1] = _mm_sub_epi32(u[1], sign[1]); - u[2] = _mm_sub_epi32(u[2], sign[2]); - u[3] = _mm_sub_epi32(u[3], sign[3]); - u[4] = _mm_sub_epi32(u[4], sign[4]); - u[5] = _mm_sub_epi32(u[5], sign[5]); - u[6] = _mm_sub_epi32(u[6], sign[6]); - u[7] = _mm_sub_epi32(u[7], sign[7]); - - u[0] = _mm_add_epi32(u[0], K32One); - u[1] = _mm_add_epi32(u[1], K32One); - u[2] = _mm_add_epi32(u[2], K32One); - u[3] = _mm_add_epi32(u[3], K32One); - u[4] = _mm_add_epi32(u[4], K32One); - u[5] = _mm_add_epi32(u[5], K32One); - u[6] = _mm_add_epi32(u[6], K32One); - u[7] = _mm_add_epi32(u[7], K32One); - - u[0] = _mm_srai_epi32(u[0], 2); - u[1] = _mm_srai_epi32(u[1], 2); - u[2] = _mm_srai_epi32(u[2], 2); - u[3] = _mm_srai_epi32(u[3], 2); - u[4] = _mm_srai_epi32(u[4], 2); - u[5] = _mm_srai_epi32(u[5], 2); - u[6] = _mm_srai_epi32(u[6], 2); - u[7] = _mm_srai_epi32(u[7], 2); - - out[4] = _mm_packs_epi32(u[0], u[1]); - out[20] = _mm_packs_epi32(u[2], u[3]); - out[12] = _mm_packs_epi32(u[4], u[5]); - out[28] = _mm_packs_epi32(u[6], u[7]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); - lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); - lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); - lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); - lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); - lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); - lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); - lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); - lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); - lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); - lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); - lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); - lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); - lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); - lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); - lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); - } - { - const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); - const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); - const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); - const __m128i k32_m12_m20 = - pair_set_epi32(-cospi_12_64, -cospi_20_64); - const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); - const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); - - u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); - u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); - u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); - u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); - u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); - u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); - u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); - u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); - u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); - u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); - u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); - u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); - u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); - u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); - u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); - u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); - - v[0] = k_madd_epi32(u[0], k32_m04_p28); - v[1] = k_madd_epi32(u[1], k32_m04_p28); - v[2] = k_madd_epi32(u[2], k32_m04_p28); - v[3] = k_madd_epi32(u[3], k32_m04_p28); - v[4] = k_madd_epi32(u[4], k32_m28_m04); - v[5] = k_madd_epi32(u[5], k32_m28_m04); - v[6] = k_madd_epi32(u[6], k32_m28_m04); - v[7] = k_madd_epi32(u[7], k32_m28_m04); - v[8] = k_madd_epi32(u[8], k32_m20_p12); - v[9] = k_madd_epi32(u[9], k32_m20_p12); - v[10] = k_madd_epi32(u[10], k32_m20_p12); - v[11] = k_madd_epi32(u[11], k32_m20_p12); - v[12] = k_madd_epi32(u[12], k32_m12_m20); - v[13] = k_madd_epi32(u[13], k32_m12_m20); - v[14] = k_madd_epi32(u[14], k32_m12_m20); - v[15] = k_madd_epi32(u[15], k32_m12_m20); - v[16] = k_madd_epi32(u[12], k32_m20_p12); - v[17] = k_madd_epi32(u[13], k32_m20_p12); - v[18] = k_madd_epi32(u[14], k32_m20_p12); - v[19] = k_madd_epi32(u[15], k32_m20_p12); - v[20] = k_madd_epi32(u[8], k32_p12_p20); - v[21] = k_madd_epi32(u[9], k32_p12_p20); - v[22] = k_madd_epi32(u[10], k32_p12_p20); - v[23] = k_madd_epi32(u[11], k32_p12_p20); - v[24] = k_madd_epi32(u[4], k32_m04_p28); - v[25] = k_madd_epi32(u[5], k32_m04_p28); - v[26] = k_madd_epi32(u[6], k32_m04_p28); - v[27] = k_madd_epi32(u[7], k32_m04_p28); - v[28] = k_madd_epi32(u[0], k32_p28_p04); - v[29] = k_madd_epi32(u[1], k32_p28_p04); - v[30] = k_madd_epi32(u[2], k32_p28_p04); - v[31] = k_madd_epi32(u[3], k32_p28_p04); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - } - // stage 7 - { - const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); - const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); - const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); - const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); - const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); - const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); - const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); - const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); - - u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); - u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); - u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); - u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); - u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); - u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); - u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); - u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); - u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); - u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); - u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); - u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); - u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); - u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); - u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); - u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); - - v[0] = k_madd_epi32(u[0], k32_p30_p02); - v[1] = k_madd_epi32(u[1], k32_p30_p02); - v[2] = k_madd_epi32(u[2], k32_p30_p02); - v[3] = k_madd_epi32(u[3], k32_p30_p02); - v[4] = k_madd_epi32(u[4], k32_p14_p18); - v[5] = k_madd_epi32(u[5], k32_p14_p18); - v[6] = k_madd_epi32(u[6], k32_p14_p18); - v[7] = k_madd_epi32(u[7], k32_p14_p18); - v[8] = k_madd_epi32(u[8], k32_p22_p10); - v[9] = k_madd_epi32(u[9], k32_p22_p10); - v[10] = k_madd_epi32(u[10], k32_p22_p10); - v[11] = k_madd_epi32(u[11], k32_p22_p10); - v[12] = k_madd_epi32(u[12], k32_p06_p26); - v[13] = k_madd_epi32(u[13], k32_p06_p26); - v[14] = k_madd_epi32(u[14], k32_p06_p26); - v[15] = k_madd_epi32(u[15], k32_p06_p26); - v[16] = k_madd_epi32(u[12], k32_m26_p06); - v[17] = k_madd_epi32(u[13], k32_m26_p06); - v[18] = k_madd_epi32(u[14], k32_m26_p06); - v[19] = k_madd_epi32(u[15], k32_m26_p06); - v[20] = k_madd_epi32(u[8], k32_m10_p22); - v[21] = k_madd_epi32(u[9], k32_m10_p22); - v[22] = k_madd_epi32(u[10], k32_m10_p22); - v[23] = k_madd_epi32(u[11], k32_m10_p22); - v[24] = k_madd_epi32(u[4], k32_m18_p14); - v[25] = k_madd_epi32(u[5], k32_m18_p14); - v[26] = k_madd_epi32(u[6], k32_m18_p14); - v[27] = k_madd_epi32(u[7], k32_m18_p14); - v[28] = k_madd_epi32(u[0], k32_m02_p30); - v[29] = k_madd_epi32(u[1], k32_m02_p30); - v[30] = k_madd_epi32(u[2], k32_m02_p30); - v[31] = k_madd_epi32(u[3], k32_m02_p30); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[2] = _mm_packs_epi32(u[0], u[1]); - out[18] = _mm_packs_epi32(u[2], u[3]); - out[10] = _mm_packs_epi32(u[4], u[5]); - out[26] = _mm_packs_epi32(u[6], u[7]); - out[6] = _mm_packs_epi32(u[8], u[9]); - out[22] = _mm_packs_epi32(u[10], u[11]); - out[14] = _mm_packs_epi32(u[12], u[13]); - out[30] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26], - &out[6], &out[22], &out[14], &out[30]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); - lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); - lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); - lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); - lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); - lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); - lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); - lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); - lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); - lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); - lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); - lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); - lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); - lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); - lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); - lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); - lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); - lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); - lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); - lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); - lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); - lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); - lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); - lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); - lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); - lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); - lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); - lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); - lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); - lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); - lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); - lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); - } - // stage 8 - { - const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); - const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); - const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); - const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); - const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); - const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); - const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); - const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); - - u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); - u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); - u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); - u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); - u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); - u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); - u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); - u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); - u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); - u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); - u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); - u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); - u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); - u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); - u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); - u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); - - v[0] = k_madd_epi32(u[0], k32_p31_p01); - v[1] = k_madd_epi32(u[1], k32_p31_p01); - v[2] = k_madd_epi32(u[2], k32_p31_p01); - v[3] = k_madd_epi32(u[3], k32_p31_p01); - v[4] = k_madd_epi32(u[4], k32_p15_p17); - v[5] = k_madd_epi32(u[5], k32_p15_p17); - v[6] = k_madd_epi32(u[6], k32_p15_p17); - v[7] = k_madd_epi32(u[7], k32_p15_p17); - v[8] = k_madd_epi32(u[8], k32_p23_p09); - v[9] = k_madd_epi32(u[9], k32_p23_p09); - v[10] = k_madd_epi32(u[10], k32_p23_p09); - v[11] = k_madd_epi32(u[11], k32_p23_p09); - v[12] = k_madd_epi32(u[12], k32_p07_p25); - v[13] = k_madd_epi32(u[13], k32_p07_p25); - v[14] = k_madd_epi32(u[14], k32_p07_p25); - v[15] = k_madd_epi32(u[15], k32_p07_p25); - v[16] = k_madd_epi32(u[12], k32_m25_p07); - v[17] = k_madd_epi32(u[13], k32_m25_p07); - v[18] = k_madd_epi32(u[14], k32_m25_p07); - v[19] = k_madd_epi32(u[15], k32_m25_p07); - v[20] = k_madd_epi32(u[8], k32_m09_p23); - v[21] = k_madd_epi32(u[9], k32_m09_p23); - v[22] = k_madd_epi32(u[10], k32_m09_p23); - v[23] = k_madd_epi32(u[11], k32_m09_p23); - v[24] = k_madd_epi32(u[4], k32_m17_p15); - v[25] = k_madd_epi32(u[5], k32_m17_p15); - v[26] = k_madd_epi32(u[6], k32_m17_p15); - v[27] = k_madd_epi32(u[7], k32_m17_p15); - v[28] = k_madd_epi32(u[0], k32_m01_p31); - v[29] = k_madd_epi32(u[1], k32_m01_p31); - v[30] = k_madd_epi32(u[2], k32_m01_p31); - v[31] = k_madd_epi32(u[3], k32_m01_p31); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[1] = _mm_packs_epi32(u[0], u[1]); - out[17] = _mm_packs_epi32(u[2], u[3]); - out[9] = _mm_packs_epi32(u[4], u[5]); - out[25] = _mm_packs_epi32(u[6], u[7]); - out[7] = _mm_packs_epi32(u[8], u[9]); - out[23] = _mm_packs_epi32(u[10], u[11]); - out[15] = _mm_packs_epi32(u[12], u[13]); - out[31] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25], - &out[7], &out[23], &out[15], &out[31]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); - const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); - const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); - const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); - const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); - const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); - const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); - const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); - - u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); - u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); - u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); - u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); - u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); - u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); - u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); - u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); - u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); - u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); - u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); - u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); - u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); - u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); - u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); - u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); - - v[0] = k_madd_epi32(u[0], k32_p27_p05); - v[1] = k_madd_epi32(u[1], k32_p27_p05); - v[2] = k_madd_epi32(u[2], k32_p27_p05); - v[3] = k_madd_epi32(u[3], k32_p27_p05); - v[4] = k_madd_epi32(u[4], k32_p11_p21); - v[5] = k_madd_epi32(u[5], k32_p11_p21); - v[6] = k_madd_epi32(u[6], k32_p11_p21); - v[7] = k_madd_epi32(u[7], k32_p11_p21); - v[8] = k_madd_epi32(u[8], k32_p19_p13); - v[9] = k_madd_epi32(u[9], k32_p19_p13); - v[10] = k_madd_epi32(u[10], k32_p19_p13); - v[11] = k_madd_epi32(u[11], k32_p19_p13); - v[12] = k_madd_epi32(u[12], k32_p03_p29); - v[13] = k_madd_epi32(u[13], k32_p03_p29); - v[14] = k_madd_epi32(u[14], k32_p03_p29); - v[15] = k_madd_epi32(u[15], k32_p03_p29); - v[16] = k_madd_epi32(u[12], k32_m29_p03); - v[17] = k_madd_epi32(u[13], k32_m29_p03); - v[18] = k_madd_epi32(u[14], k32_m29_p03); - v[19] = k_madd_epi32(u[15], k32_m29_p03); - v[20] = k_madd_epi32(u[8], k32_m13_p19); - v[21] = k_madd_epi32(u[9], k32_m13_p19); - v[22] = k_madd_epi32(u[10], k32_m13_p19); - v[23] = k_madd_epi32(u[11], k32_m13_p19); - v[24] = k_madd_epi32(u[4], k32_m21_p11); - v[25] = k_madd_epi32(u[5], k32_m21_p11); - v[26] = k_madd_epi32(u[6], k32_m21_p11); - v[27] = k_madd_epi32(u[7], k32_m21_p11); - v[28] = k_madd_epi32(u[0], k32_m05_p27); - v[29] = k_madd_epi32(u[1], k32_m05_p27); - v[30] = k_madd_epi32(u[2], k32_m05_p27); - v[31] = k_madd_epi32(u[3], k32_m05_p27); - -#if DCT_HIGH_BIT_DEPTH - overflow = k_check_epi32_overflow_32( - &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8], - &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16], - &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24], - &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - u[0] = k_packs_epi64(v[0], v[1]); - u[1] = k_packs_epi64(v[2], v[3]); - u[2] = k_packs_epi64(v[4], v[5]); - u[3] = k_packs_epi64(v[6], v[7]); - u[4] = k_packs_epi64(v[8], v[9]); - u[5] = k_packs_epi64(v[10], v[11]); - u[6] = k_packs_epi64(v[12], v[13]); - u[7] = k_packs_epi64(v[14], v[15]); - u[8] = k_packs_epi64(v[16], v[17]); - u[9] = k_packs_epi64(v[18], v[19]); - u[10] = k_packs_epi64(v[20], v[21]); - u[11] = k_packs_epi64(v[22], v[23]); - u[12] = k_packs_epi64(v[24], v[25]); - u[13] = k_packs_epi64(v[26], v[27]); - u[14] = k_packs_epi64(v[28], v[29]); - u[15] = k_packs_epi64(v[30], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - v[0] = _mm_cmplt_epi32(u[0], kZero); - v[1] = _mm_cmplt_epi32(u[1], kZero); - v[2] = _mm_cmplt_epi32(u[2], kZero); - v[3] = _mm_cmplt_epi32(u[3], kZero); - v[4] = _mm_cmplt_epi32(u[4], kZero); - v[5] = _mm_cmplt_epi32(u[5], kZero); - v[6] = _mm_cmplt_epi32(u[6], kZero); - v[7] = _mm_cmplt_epi32(u[7], kZero); - v[8] = _mm_cmplt_epi32(u[8], kZero); - v[9] = _mm_cmplt_epi32(u[9], kZero); - v[10] = _mm_cmplt_epi32(u[10], kZero); - v[11] = _mm_cmplt_epi32(u[11], kZero); - v[12] = _mm_cmplt_epi32(u[12], kZero); - v[13] = _mm_cmplt_epi32(u[13], kZero); - v[14] = _mm_cmplt_epi32(u[14], kZero); - v[15] = _mm_cmplt_epi32(u[15], kZero); - - u[0] = _mm_sub_epi32(u[0], v[0]); - u[1] = _mm_sub_epi32(u[1], v[1]); - u[2] = _mm_sub_epi32(u[2], v[2]); - u[3] = _mm_sub_epi32(u[3], v[3]); - u[4] = _mm_sub_epi32(u[4], v[4]); - u[5] = _mm_sub_epi32(u[5], v[5]); - u[6] = _mm_sub_epi32(u[6], v[6]); - u[7] = _mm_sub_epi32(u[7], v[7]); - u[8] = _mm_sub_epi32(u[8], v[8]); - u[9] = _mm_sub_epi32(u[9], v[9]); - u[10] = _mm_sub_epi32(u[10], v[10]); - u[11] = _mm_sub_epi32(u[11], v[11]); - u[12] = _mm_sub_epi32(u[12], v[12]); - u[13] = _mm_sub_epi32(u[13], v[13]); - u[14] = _mm_sub_epi32(u[14], v[14]); - u[15] = _mm_sub_epi32(u[15], v[15]); - - v[0] = _mm_add_epi32(u[0], K32One); - v[1] = _mm_add_epi32(u[1], K32One); - v[2] = _mm_add_epi32(u[2], K32One); - v[3] = _mm_add_epi32(u[3], K32One); - v[4] = _mm_add_epi32(u[4], K32One); - v[5] = _mm_add_epi32(u[5], K32One); - v[6] = _mm_add_epi32(u[6], K32One); - v[7] = _mm_add_epi32(u[7], K32One); - v[8] = _mm_add_epi32(u[8], K32One); - v[9] = _mm_add_epi32(u[9], K32One); - v[10] = _mm_add_epi32(u[10], K32One); - v[11] = _mm_add_epi32(u[11], K32One); - v[12] = _mm_add_epi32(u[12], K32One); - v[13] = _mm_add_epi32(u[13], K32One); - v[14] = _mm_add_epi32(u[14], K32One); - v[15] = _mm_add_epi32(u[15], K32One); - - u[0] = _mm_srai_epi32(v[0], 2); - u[1] = _mm_srai_epi32(v[1], 2); - u[2] = _mm_srai_epi32(v[2], 2); - u[3] = _mm_srai_epi32(v[3], 2); - u[4] = _mm_srai_epi32(v[4], 2); - u[5] = _mm_srai_epi32(v[5], 2); - u[6] = _mm_srai_epi32(v[6], 2); - u[7] = _mm_srai_epi32(v[7], 2); - u[8] = _mm_srai_epi32(v[8], 2); - u[9] = _mm_srai_epi32(v[9], 2); - u[10] = _mm_srai_epi32(v[10], 2); - u[11] = _mm_srai_epi32(v[11], 2); - u[12] = _mm_srai_epi32(v[12], 2); - u[13] = _mm_srai_epi32(v[13], 2); - u[14] = _mm_srai_epi32(v[14], 2); - u[15] = _mm_srai_epi32(v[15], 2); - - out[5] = _mm_packs_epi32(u[0], u[1]); - out[21] = _mm_packs_epi32(u[2], u[3]); - out[13] = _mm_packs_epi32(u[4], u[5]); - out[29] = _mm_packs_epi32(u[6], u[7]); - out[3] = _mm_packs_epi32(u[8], u[9]); - out[19] = _mm_packs_epi32(u[10], u[11]); - out[11] = _mm_packs_epi32(u[12], u[13]); - out[27] = _mm_packs_epi32(u[14], u[15]); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29], - &out[3], &out[19], &out[11], &out[27]); - if (overflow) { - HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } -#endif // FDCT32x32_HIGH_PRECISION - // Transpose the results, do it as four 8x8 transposes. - { - int transpose_block; - int16_t *output0 = &intermediate[column_start * 32]; - tran_low_t *output1 = &output_org[column_start * 32]; - for (transpose_block = 0; transpose_block < 4; ++transpose_block) { - __m128i *this_out = &out[8 * transpose_block]; - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - if (0 == pass) { - // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; - // TODO(cd): see quality impact of only doing - // output[j] = (output[j] + 1) >> 2; - // which would remove the code between here ... - __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); - __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); - __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); - __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); - __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); - __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); - __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); - __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); - tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); - tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); - tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); - tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); - tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); - tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); - tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); - tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); - // ... and here. - // PS: also change code in av1/encoder/av1_dct.c - tr2_0 = _mm_add_epi16(tr2_0, kOne); - tr2_1 = _mm_add_epi16(tr2_1, kOne); - tr2_2 = _mm_add_epi16(tr2_2, kOne); - tr2_3 = _mm_add_epi16(tr2_3, kOne); - tr2_4 = _mm_add_epi16(tr2_4, kOne); - tr2_5 = _mm_add_epi16(tr2_5, kOne); - tr2_6 = _mm_add_epi16(tr2_6, kOne); - tr2_7 = _mm_add_epi16(tr2_7, kOne); - tr2_0 = _mm_srai_epi16(tr2_0, 2); - tr2_1 = _mm_srai_epi16(tr2_1, 2); - tr2_2 = _mm_srai_epi16(tr2_2, 2); - tr2_3 = _mm_srai_epi16(tr2_3, 2); - tr2_4 = _mm_srai_epi16(tr2_4, 2); - tr2_5 = _mm_srai_epi16(tr2_5, 2); - tr2_6 = _mm_srai_epi16(tr2_6, 2); - tr2_7 = _mm_srai_epi16(tr2_7, 2); - } - // Note: even though all these stores are aligned, using the aligned - // intrinsic make the code slightly slower. - if (pass == 0) { - _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0); - _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1); - _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2); - _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3); - _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4); - _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5); - _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6); - _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7); - // Process next 8x8 - output0 += 8; - } else { - storeu_output(&tr2_0, (output1 + 0 * 32)); - storeu_output(&tr2_1, (output1 + 1 * 32)); - storeu_output(&tr2_2, (output1 + 2 * 32)); - storeu_output(&tr2_3, (output1 + 3 * 32)); - storeu_output(&tr2_4, (output1 + 4 * 32)); - storeu_output(&tr2_5, (output1 + 5 * 32)); - storeu_output(&tr2_6, (output1 + 6 * 32)); - storeu_output(&tr2_7, (output1 + 7 * 32)); - // Process next 8x8 - output1 += 8; - } - } - } - } - } -} // NOLINT - -#undef ADD_EPI16 -#undef SUB_EPI16 -#undef HIGH_FDCT32x32_2D_C -#undef HIGH_FDCT32x32_2D_ROWS_C diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c deleted file mode 100644 index 670f864d0..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.c +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_config.h" - -#define FDCT32x32_2D_AVX2 aom_fdct32x32_rd_avx2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" -#undef FDCT32x32_2D_AVX2 -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D_AVX2 aom_fdct32x32_avx2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "aom_dsp/x86/fwd_dct32x32_impl_avx2.h" // NOLINT -#undef FDCT32x32_2D_AVX2 -#undef FDCT32x32_HIGH_PRECISION diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h deleted file mode 100644 index 86df4a6f6..000000000 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_avx2.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H -#define AOM_DSP_X86_FWD_TXFM_AVX2_H - -#include "./aom_config.h" - -static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) { - if (sizeof(tran_low_t) == 4) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); - - __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); - __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); - - __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); - __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); - - _mm256_storeu_si256((__m256i *)out, y0); - _mm256_storeu_si256((__m256i *)(out + 8), y1); - } else { - _mm256_storeu_si256((__m256i *)out, *coeff); - } -} - -#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h index 7bb1db70a..1e3d13ec8 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -11,7 +11,8 @@ #include // SSE2 -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/txfm_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" #include "aom_dsp/x86/txfm_common_sse2.h" @@ -29,233 +30,6 @@ #define SUB_EPI16 _mm_sub_epi16 #endif -void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { - // This 2D transform implements 4 vertical 1D transforms followed - // by 4 horizontal 1D transforms. The multiplies and adds are as given - // by Chen, Smith and Fralick ('77). The commands for moving the data - // around have been minimized by hand. - // For the purposes of the comments, the 16 inputs are referred to at i0 - // through iF (in raster order), intermediate variables are a0, b0, c0 - // through f, and correspond to the in-place computations mapped to input - // locations. The outputs, o0 through oF are labeled according to the - // output locations. - - // Constants - // These are the coefficients used for the multiplies. - // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), - // where cospi_N_64 = cos(N pi /64) - const __m128i k__cospi_A = - octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, - cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); - const __m128i k__cospi_B = - octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, - cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); - const __m128i k__cospi_C = - octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, - cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); - const __m128i k__cospi_D = - octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, - cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); - const __m128i k__cospi_E = - octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, - cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); - const __m128i k__cospi_F = - octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, - cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); - const __m128i k__cospi_G = - octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, - -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); - const __m128i k__cospi_H = - octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, - -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); - - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // This second rounding constant saves doing some extra adds at the end - const __m128i k__DCT_CONST_ROUNDING2 = - _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); - const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i in0, in1; -#if DCT_HIGH_BIT_DEPTH - __m128i cmp0, cmp1; - int test, overflow; -#endif - - // Load inputs. - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - // in0 = [i0 i1 i2 i3 iC iD iE iF] - // in1 = [i4 i5 i6 i7 i8 i9 iA iB] - in1 = _mm_unpacklo_epi64( - in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); - in0 = _mm_unpacklo_epi64( - in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); -#if DCT_HIGH_BIT_DEPTH - // Check inputs small enough to use optimised code - cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)), - _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00))); - cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)), - _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00))); - test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1)); - if (test) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - - // multiply by 16 to give some extra precision - in0 = _mm_slli_epi16(in0, 4); - in1 = _mm_slli_epi16(in1, 4); - // if (i == 0 && input[0]) input[0] += 1; - // add 1 to the upper left pixel if it is non-zero, which helps reduce - // the round-trip error - { - // The mask will only contain whether the first value is zero, all - // other comparison will fail as something shifted by 4 (above << 4) - // can never be equal to one. To increment in the non-zero case, we - // add the mask and one for the first element: - // - if zero, mask = -1, v = v - 1 + 1 = v - // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 - __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); - in0 = _mm_add_epi16(in0, mask); - in0 = _mm_add_epi16(in0, k__nonzero_bias_b); - } - // There are 4 total stages, alternating between an add/subtract stage - // followed by an multiply-and-add stage. - { - // Stage 1: Add/subtract - - // in0 = [i0 i1 i2 i3 iC iD iE iF] - // in1 = [i4 i5 i6 i7 i8 i9 iA iB] - const __m128i r0 = _mm_unpacklo_epi16(in0, in1); - const __m128i r1 = _mm_unpackhi_epi16(in0, in1); - // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] - // r1 = [iC i8 iD i9 iE iA iF iB] - const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); - const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); - // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] - // r3 = [iC i8 iD i9 iF iB iE iA] - - const __m128i t0 = _mm_add_epi16(r2, r3); - const __m128i t1 = _mm_sub_epi16(r2, r3); - // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] - // t1 = [aC a8 aD a9 aF aB aE aA] - - // Stage 2: multiply by constants (which gets us into 32 bits). - // The constants needed here are: - // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] - // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] - // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] - // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); - // Then add and right-shift to get back to 16-bit range - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // w0 = [b0 b1 b7 b6] - // w1 = [b8 b9 bF bE] - // w2 = [b4 b5 b3 b2] - // w3 = [bC bD bB bA] - const __m128i x0 = _mm_packs_epi32(w0, w1); - const __m128i x1 = _mm_packs_epi32(w2, w3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&x0, &x1); - if (overflow) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // x0 = [b0 b1 b7 b6 b8 b9 bF bE] - // x1 = [b4 b5 b3 b2 bC bD bB bA] - in0 = _mm_shuffle_epi32(x0, 0xD8); - in1 = _mm_shuffle_epi32(x1, 0x8D); - // in0 = [b0 b1 b8 b9 b7 b6 bF bE] - // in1 = [b3 b2 bB bA b4 b5 bC bD] - } - { - // vertical DCTs finished. Now we do the horizontal DCTs. - // Stage 3: Add/subtract - - // t0 = [c0 c1 c8 c9 c4 c5 cC cD] - // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE] - const __m128i t0 = ADD_EPI16(in0, in1); - const __m128i t1 = SUB_EPI16(in0, in1); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&t0, &t1); - if (overflow) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - - // Stage 4: multiply by constants (which gets us into 32 bits). - { - // The constants needed here are: - // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] - // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] - // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] - // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); - const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); - const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); - // Then add and right-shift to get back to 16-bit range - // but this combines the final right-shift as well to save operations - // This unusual rounding operations is to maintain bit-accurate - // compatibility with the c version of this function which has two - // rounding steps in a row. - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); - // w0 = [o0 o4 o8 oC] - // w1 = [o2 o6 oA oE] - // w2 = [o1 o5 o9 oD] - // w3 = [o3 o7 oB oF] - // remember the o's are numbered according to the correct output location - const __m128i x0 = _mm_packs_epi32(w0, w1); - const __m128i x1 = _mm_packs_epi32(w2, w3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&x0, &x1); - if (overflow) { - aom_highbd_fdct4x4_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - { - // x0 = [o0 o4 o8 oC o2 o6 oA oE] - // x1 = [o1 o5 o9 oD o3 o7 oB oF] - const __m128i y0 = _mm_unpacklo_epi16(x0, x1); - const __m128i y1 = _mm_unpackhi_epi16(x0, x1); - // y0 = [o0 o1 o4 o5 o8 o9 oC oD] - // y1 = [o2 o3 o6 o7 oA oB oE oF] - in0 = _mm_unpacklo_epi32(y0, y1); - // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] - in1 = _mm_unpackhi_epi32(y0, y1); - // in1 = [o8 o9 oA oB oC oD oE oF] - } - } - } - // Post-condition (v + 1) >> 2 is now incorporated into previous - // add and right-shift commands. Only 2 store instructions needed - // because we are using the fact that 1/3 are stored just after 0/2. - storeu_output(&in0, output + 0 * 4); - storeu_output(&in1, output + 2 * 4); -} - void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int pass; // Constants @@ -566,449 +340,5 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { } } -void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED(16, int16_t, intermediate[256]); - const int16_t *in = input; - int16_t *out0 = intermediate; - tran_low_t *out1 = output; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; -#if DCT_HIGH_BIT_DEPTH - int overflow; -#endif - for (column_start = 0; column_start < 16; column_start += 8) { - __m128i in00, in01, in02, in03, in04, in05, in06, in07; - __m128i in08, in09, in10, in11, in12, in13, in14, in15; - __m128i input0, input1, input2, input3, input4, input5, input6, input7; - __m128i step1_0, step1_1, step1_2, step1_3; - __m128i step1_4, step1_5, step1_6, step1_7; - __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - __m128i step3_0, step3_1, step3_2, step3_3; - __m128i step3_4, step3_5, step3_6, step3_7; - __m128i res00, res01, res02, res03, res04, res05, res06, res07; - __m128i res08, res09, res10, res11, res12, res13, res14, res15; - // Load and pre-condition input. - if (0 == pass) { - in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); - // x = x << 2 - in00 = _mm_slli_epi16(in00, 2); - in01 = _mm_slli_epi16(in01, 2); - in02 = _mm_slli_epi16(in02, 2); - in03 = _mm_slli_epi16(in03, 2); - in04 = _mm_slli_epi16(in04, 2); - in05 = _mm_slli_epi16(in05, 2); - in06 = _mm_slli_epi16(in06, 2); - in07 = _mm_slli_epi16(in07, 2); - in08 = _mm_slli_epi16(in08, 2); - in09 = _mm_slli_epi16(in09, 2); - in10 = _mm_slli_epi16(in10, 2); - in11 = _mm_slli_epi16(in11, 2); - in12 = _mm_slli_epi16(in12, 2); - in13 = _mm_slli_epi16(in13, 2); - in14 = _mm_slli_epi16(in14, 2); - in15 = _mm_slli_epi16(in15, 2); - } else { - in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); - // x = (x + 1) >> 2 - in00 = _mm_add_epi16(in00, kOne); - in01 = _mm_add_epi16(in01, kOne); - in02 = _mm_add_epi16(in02, kOne); - in03 = _mm_add_epi16(in03, kOne); - in04 = _mm_add_epi16(in04, kOne); - in05 = _mm_add_epi16(in05, kOne); - in06 = _mm_add_epi16(in06, kOne); - in07 = _mm_add_epi16(in07, kOne); - in08 = _mm_add_epi16(in08, kOne); - in09 = _mm_add_epi16(in09, kOne); - in10 = _mm_add_epi16(in10, kOne); - in11 = _mm_add_epi16(in11, kOne); - in12 = _mm_add_epi16(in12, kOne); - in13 = _mm_add_epi16(in13, kOne); - in14 = _mm_add_epi16(in14, kOne); - in15 = _mm_add_epi16(in15, kOne); - in00 = _mm_srai_epi16(in00, 2); - in01 = _mm_srai_epi16(in01, 2); - in02 = _mm_srai_epi16(in02, 2); - in03 = _mm_srai_epi16(in03, 2); - in04 = _mm_srai_epi16(in04, 2); - in05 = _mm_srai_epi16(in05, 2); - in06 = _mm_srai_epi16(in06, 2); - in07 = _mm_srai_epi16(in07, 2); - in08 = _mm_srai_epi16(in08, 2); - in09 = _mm_srai_epi16(in09, 2); - in10 = _mm_srai_epi16(in10, 2); - in11 = _mm_srai_epi16(in11, 2); - in12 = _mm_srai_epi16(in12, 2); - in13 = _mm_srai_epi16(in13, 2); - in14 = _mm_srai_epi16(in14, 2); - in15 = _mm_srai_epi16(in15, 2); - } - in += 8; - // Calculate input for the first 8 results. - { - input0 = ADD_EPI16(in00, in15); - input1 = ADD_EPI16(in01, in14); - input2 = ADD_EPI16(in02, in13); - input3 = ADD_EPI16(in03, in12); - input4 = ADD_EPI16(in04, in11); - input5 = ADD_EPI16(in05, in10); - input6 = ADD_EPI16(in06, in09); - input7 = ADD_EPI16(in07, in08); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3, - &input4, &input5, &input6, &input7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Calculate input for the next 8 results. - { - step1_0 = SUB_EPI16(in07, in08); - step1_1 = SUB_EPI16(in06, in09); - step1_2 = SUB_EPI16(in05, in10); - step1_3 = SUB_EPI16(in04, in11); - step1_4 = SUB_EPI16(in03, in12); - step1_5 = SUB_EPI16(in02, in13); - step1_6 = SUB_EPI16(in01, in14); - step1_7 = SUB_EPI16(in00, in15); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, - &step1_4, &step1_5, &step1_6, &step1_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // Work on the first eight values; fdct8(input, even_results); - { - // Add/subtract - const __m128i q0 = ADD_EPI16(input0, input7); - const __m128i q1 = ADD_EPI16(input1, input6); - const __m128i q2 = ADD_EPI16(input2, input5); - const __m128i q3 = ADD_EPI16(input3, input4); - const __m128i q4 = SUB_EPI16(input3, input4); - const __m128i q5 = SUB_EPI16(input2, input5); - const __m128i q6 = SUB_EPI16(input1, input6); - const __m128i q7 = SUB_EPI16(input0, input7); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Work on first four results - { - // Add/subtract - const __m128i r0 = ADD_EPI16(q0, q3); - const __m128i r1 = ADD_EPI16(q1, q2); - const __m128i r2 = SUB_EPI16(q1, q2); - const __m128i r3 = SUB_EPI16(q0, q3); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us - // into 32 bits. - { - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i r0 = - mult_round_shift(&d0, &d1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - const __m128i r1 = - mult_round_shift(&d0, &d1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x2(&r0, &r1); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - { - // Add/subtract - const __m128i x0 = ADD_EPI16(q4, r0); - const __m128i x1 = SUB_EPI16(q4, r0); - const __m128i x2 = SUB_EPI16(q7, r1); - const __m128i x3 = ADD_EPI16(q7, r1); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - // Interleave to do the multiply by constants which gets us - // into 32 bits. - { - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&res02, &res14, &res10, &res06); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - } - } - // Work on the next eight values; step1 -> odd_results - { - // step 2 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); - const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); - const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); - const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); - step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 3 - { - step3_0 = ADD_EPI16(step1_0, step2_3); - step3_1 = ADD_EPI16(step1_1, step2_2); - step3_2 = SUB_EPI16(step1_1, step2_2); - step3_3 = SUB_EPI16(step1_0, step2_3); - step3_4 = SUB_EPI16(step1_7, step2_4); - step3_5 = SUB_EPI16(step1_6, step2_5); - step3_6 = ADD_EPI16(step1_6, step2_5); - step3_7 = ADD_EPI16(step1_7, step2_4); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3, - &step3_4, &step3_5, &step3_6, &step3_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 4 - { - const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); - const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); - const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); - const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); - step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 5 - { - step1_0 = ADD_EPI16(step3_0, step2_1); - step1_1 = SUB_EPI16(step3_0, step2_1); - step1_2 = ADD_EPI16(step3_3, step2_2); - step1_3 = SUB_EPI16(step3_3, step2_2); - step1_4 = SUB_EPI16(step3_4, step2_5); - step1_5 = ADD_EPI16(step3_4, step2_5); - step1_6 = SUB_EPI16(step3_7, step2_6); - step1_7 = ADD_EPI16(step3_7, step2_6); -#if DCT_HIGH_BIT_DEPTH - overflow = - check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3, - &step1_4, &step1_5, &step1_6, &step1_7); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - // step 6 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); - const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); - const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); - const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); - res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); - const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); - const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); - const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); - res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); - res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06, - &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); -#if DCT_HIGH_BIT_DEPTH - overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03); - if (overflow) { - aom_highbd_fdct16x16_c(input, output, stride); - return; - } -#endif // DCT_HIGH_BIT_DEPTH - } - } - // Transpose the results, do it as two 8x8 transposes. - transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05, - &res06, &res07, pass, out0, out1); - transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13, - &res14, &res15, pass, out0 + 8, out1 + 8); - if (pass == 0) { - out0 += 8 * 16; - } else { - out1 += 8 * 16; - } - } - // Setup in/out for next pass. - in = intermediate; - } -} - #undef ADD_EPI16 #undef SUB_EPI16 diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c index 657dcfa22..2d8f8f71e 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c @@ -11,40 +11,12 @@ #include // SSE2 -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" -void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0, in1; - __m128i tmp; - const __m128i zero = _mm_setzero_si128(); - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in1 = _mm_unpacklo_epi64( - in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); - in0 = _mm_unpacklo_epi64( - in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); - - tmp = _mm_add_epi16(in0, in1); - in0 = _mm_unpacklo_epi16(zero, tmp); - in1 = _mm_unpackhi_epi16(zero, tmp); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(tmp, zero); - in1 = _mm_unpackhi_epi32(tmp, zero); - - tmp = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(tmp, 8); - - in1 = _mm_add_epi32(tmp, in0); - in0 = _mm_slli_epi32(in1, 1); - output[0] = (tran_low_t)_mm_cvtsi128_si32(in0); -} - void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); @@ -86,47 +58,12 @@ void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { } #define DCT_HIGH_BIT_DEPTH 0 -#define FDCT4x4_2D aom_fdct4x4_sse2 #define FDCT8x8_2D aom_fdct8x8_sse2 -#define FDCT16x16_2D aom_fdct16x16_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" -#undef FDCT4x4_2D #undef FDCT8x8_2D -#undef FDCT16x16_2D -#define FDCT32x32_2D aom_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D aom_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION #undef DCT_HIGH_BIT_DEPTH - -#if CONFIG_HIGHBITDEPTH #define DCT_HIGH_BIT_DEPTH 1 -#define FDCT4x4_2D aom_highbd_fdct4x4_sse2 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2 -#define FDCT16x16_2D aom_highbd_fdct16x16_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT -#undef FDCT4x4_2D #undef FDCT8x8_2D -#undef FDCT16x16_2D - -#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2 -#define FDCT32x32_HIGH_PRECISION 0 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION - -#define FDCT32x32_2D aom_highbd_fdct32x32_sse2 -#define FDCT32x32_HIGH_PRECISION 1 -#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT -#undef FDCT32x32_2D -#undef FDCT32x32_HIGH_PRECISION -#undef DCT_HIGH_BIT_DEPTH -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 58e8971dd..12ccf7f26 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -12,15 +12,10 @@ #ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ #define AOM_DSP_X86_FWD_TXFM_SSE2_H_ -#include "aom_dsp/x86/txfm_common_intrin.h" - #ifdef __cplusplus extern "C" { #endif -#define pair_set_epi32(a, b) \ - _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) - static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) { __m128i buf0, buf1; buf0 = _mm_mul_epu32(a, b); @@ -140,112 +135,6 @@ static INLINE int check_epi16_overflow_x32( return res0 + res1; } -static INLINE int k_check_epi32_overflow_4(const __m128i *preg0, - const __m128i *preg1, - const __m128i *preg2, - const __m128i *preg3, - const __m128i *zero) { - __m128i minus_one = _mm_set1_epi32(-1); - // Check for overflows - __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1); - __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1); - __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1); - __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1); - __m128i reg0_top_dwords = - _mm_shuffle_epi32(reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg1_top_dwords = - _mm_shuffle_epi32(reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg2_top_dwords = - _mm_shuffle_epi32(reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i reg3_top_dwords = - _mm_shuffle_epi32(reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1)); - __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords); - __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords); - __m128i valid_positve_01 = _mm_cmpeq_epi32(top_dwords_01, *zero); - __m128i valid_positve_23 = _mm_cmpeq_epi32(top_dwords_23, *zero); - __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one); - __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one); - int overflow_01 = - _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_01, valid_negative_01)); - int overflow_23 = - _mm_movemask_epi8(_mm_cmpeq_epi32(valid_positve_23, valid_negative_23)); - return (overflow_01 + overflow_23); -} - -static INLINE int k_check_epi32_overflow_8( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - } - return overflow; -} - -static INLINE int k_check_epi32_overflow_16( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i *preg15, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); - } - } - } - return overflow; -} - -static INLINE int k_check_epi32_overflow_32( - const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, - const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, - const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, - const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, - const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, - const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, - const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, - const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, - const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, - const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, - const __m128i *preg30, const __m128i *preg31, const __m128i *zero) { - int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg12, preg13, preg14, preg15, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg16, preg17, preg18, preg19, zero); - if (!overflow) { - overflow = - k_check_epi32_overflow_4(preg20, preg21, preg22, preg23, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg24, preg25, preg26, - preg27, zero); - if (!overflow) { - overflow = k_check_epi32_overflow_4(preg28, preg29, preg30, - preg31, zero); - } - } - } - } - } - } - } - return overflow; -} - static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { if (sizeof(tran_low_t) == 4) { const __m128i zero = _mm_setzero_si128(); @@ -259,102 +148,6 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { } } -static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1, - const __m128i *pmultiplier, - const __m128i *prounding, int shift) { - const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier); - const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier); - const __m128i v0 = _mm_add_epi32(u0, *prounding); - const __m128i v1 = _mm_add_epi32(u1, *prounding); - const __m128i w0 = _mm_srai_epi32(v0, shift); - const __m128i w1 = _mm_srai_epi32(v1, shift); - return _mm_packs_epi32(w0, w1); -} - -static INLINE void transpose_and_output8x8( - const __m128i *pin00, const __m128i *pin01, const __m128i *pin02, - const __m128i *pin03, const __m128i *pin04, const __m128i *pin05, - const __m128i *pin06, const __m128i *pin07, int pass, int16_t *out0_ptr, - tran_low_t *out1_ptr) { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01); - const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03); - const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01); - const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03); - const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05); - const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07); - const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05); - const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - if (pass == 0) { - _mm_storeu_si128((__m128i *)(out0_ptr + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i *)(out0_ptr + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i *)(out0_ptr + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i *)(out0_ptr + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i *)(out0_ptr + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i *)(out0_ptr + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i *)(out0_ptr + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i *)(out0_ptr + 7 * 16), tr2_7); - } else { - storeu_output(&tr2_0, (out1_ptr + 0 * 16)); - storeu_output(&tr2_1, (out1_ptr + 1 * 16)); - storeu_output(&tr2_2, (out1_ptr + 2 * 16)); - storeu_output(&tr2_3, (out1_ptr + 3 * 16)); - storeu_output(&tr2_4, (out1_ptr + 4 * 16)); - storeu_output(&tr2_5, (out1_ptr + 5 * 16)); - storeu_output(&tr2_6, (out1_ptr + 6 * 16)); - storeu_output(&tr2_7, (out1_ptr + 7 * 16)); - } -} - -void fdct32_8col(__m128i *in0, __m128i *in1); - #ifdef __cplusplus } // extern "C" #endif diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm index 8fa1c04d0..c1fb259a1 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -13,10 +13,6 @@ %include "third_party/x86inc/x86inc.asm" -; This file provides SSSE3 version of the forward transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. - SECTION_RODATA pw_11585x2: times 8 dw 23170 @@ -32,106 +28,7 @@ TRANSFORM_COEFFS 15137, 6270 TRANSFORM_COEFFS 16069, 3196 TRANSFORM_COEFFS 9102, 13623 -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -; 1D forward 8x8 DCT transform -%macro FDCT8_1D 1 - SUM_SUB 0, 7, 9 - SUM_SUB 1, 6, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 3, 4, 9 - - SUM_SUB 0, 3, 9 - SUM_SUB 1, 2, 9 - SUM_SUB 6, 5, 9 -%if %1 == 0 - SUM_SUB 0, 1, 9 -%endif - - BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 - - pmulhrsw m6, m12 - pmulhrsw m5, m12 -%if %1 == 0 - pmulhrsw m0, m12 - pmulhrsw m1, m12 -%else - BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 - SWAP 0, 1 -%endif - - SUM_SUB 4, 5, 9 - SUM_SUB 7, 6, 9 - BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10 - SWAP 1, 4 - SWAP 3, 6 -%endmacro - -%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2 - psraw m%3, m%1, 15 - psraw m%4, m%2, 15 - psubw m%1, m%3 - psubw m%2, m%4 - psraw m%1, 1 - psraw m%2, 1 -%endmacro - %macro STORE_OUTPUT 2 ; index, result -%if CONFIG_HIGHBITDEPTH ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); @@ -144,16 +41,16 @@ SECTION .text punpckhwd m12, m11 mova [outputq + 4*%1 + 0], m%2 mova [outputq + 4*%1 + 16], m12 -%else - mova [outputq + 2*%1], m%2 -%endif %endmacro +SECTION .text + +%if ARCH_X86_64 INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -180,25 +77,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride psllw m7, 2 ; column transform - FDCT8_1D 0 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - FDCT8_1D 1 - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - - DIVIDE_ROUND_2X 0, 1, 9, 10 - DIVIDE_ROUND_2X 2, 3, 9, 10 - DIVIDE_ROUND_2X 4, 5, 9, 10 - DIVIDE_ROUND_2X 6, 7, 9, 10 - - STORE_OUTPUT 0, 0 - STORE_OUTPUT 8, 1 - STORE_OUTPUT 16, 2 - STORE_OUTPUT 24, 3 - STORE_OUTPUT 32, 4 - STORE_OUTPUT 40, 5 - STORE_OUTPUT 48, 6 - STORE_OUTPUT 56, 7 + ; stage 1 + paddw m10, m0, m7 + psubw m0, m7 + + paddw m9, m1, m6 + psubw m1, m6 + + paddw m7, m2, m5 + psubw m2, m5 + + paddw m6, m3, m4 + psubw m3, m4 + + ; stage 2 + paddw m5, m9, m7 + psubw m9, m7 + + paddw m4, m10, m6 + psubw m10, m6 + + paddw m7, m1, m2 + psubw m1, m2 + + ; stage 3 + paddw m6, m4, m5 + psubw m4, m5 + + pmulhrsw m1, m12 + pmulhrsw m7, m12 + + ; sin(pi / 8), cos(pi / 8) + punpcklwd m2, m10, m9 + punpckhwd m10, m9 + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] + paddd m5, m8 + paddd m2, m8 + paddd m9, m8 + paddd m10, m8 + psrad m5, 14 + psrad m2, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m5, m9 + packssdw m2, m10 + + pmulhrsw m6, m12 + pmulhrsw m4, m12 + + paddw m9, m3, m1 + psubw m3, m1 + + paddw m10, m0, m7 + psubw m0, m7 + + ; stage 4 + ; sin(pi / 16), cos(pi / 16) + punpcklwd m1, m10, m9 + punpckhwd m10, m9 + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m1, m8 + paddd m9, m8 + paddd m10, m8 + psrad m7, 14 + psrad m1, 14 + psrad m9, 14 + psrad m10, 14 + packssdw m7, m9 + packssdw m1, m10 + + ; sin(3 * pi / 16), cos(3 * pi / 16) + punpcklwd m11, m0, m3 + punpckhwd m0, m3 + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] + paddd m9, m8 + paddd m11, m8 + paddd m3, m8 + paddd m0, m8 + psrad m9, 14 + psrad m11, 14 + psrad m3, 14 + psrad m0, 14 + packssdw m9, m3 + packssdw m11, m0 + + ; transpose + ; stage 1 + punpcklwd m0, m6, m7 + punpcklwd m3, m5, m11 + punpckhwd m6, m7 + punpckhwd m5, m11 + punpcklwd m7, m4, m9 + punpcklwd m10, m2, m1 + punpckhwd m4, m9 + punpckhwd m2, m1 + + ; stage 2 + punpckldq m9, m0, m3 + punpckldq m1, m6, m5 + punpckhdq m0, m3 + punpckhdq m6, m5 + punpckldq m3, m7, m10 + punpckldq m5, m4, m2 + punpckhdq m7, m10 + punpckhdq m4, m2 + + ; stage 3 + punpcklqdq m10, m9, m3 + punpckhqdq m9, m3 + punpcklqdq m2, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m3, m1, m5 + punpckhqdq m1, m5 + punpcklqdq m7, m6, m4 + punpckhqdq m6, m4 + + ; row transform + ; stage 1 + paddw m5, m10, m6 + psubw m10, m6 + + paddw m4, m9, m7 + psubw m9, m7 + + paddw m6, m2, m1 + psubw m2, m1 + + paddw m7, m0, m3 + psubw m0, m3 + + ;stage 2 + paddw m1, m5, m7 + psubw m5, m7 + + paddw m3, m4, m6 + psubw m4, m6 + + paddw m7, m9, m2 + psubw m9, m2 + + ; stage 3 + punpcklwd m6, m1, m3 + punpckhwd m1, m3 + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] + paddd m2, m8 + paddd m6, m8 + paddd m3, m8 + paddd m1, m8 + psrad m2, 14 + psrad m6, 14 + psrad m3, 14 + psrad m1, 14 + packssdw m2, m3 + packssdw m6, m1 + + pmulhrsw m7, m12 + pmulhrsw m9, m12 + + punpcklwd m3, m5, m4 + punpckhwd m5, m4 + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] + paddd m1, m8 + paddd m3, m8 + paddd m4, m8 + paddd m5, m8 + psrad m1, 14 + psrad m3, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m1, m4 + packssdw m3, m5 + + paddw m4, m0, m9 + psubw m0, m9 + + paddw m5, m10, m7 + psubw m10, m7 + + ; stage 4 + punpcklwd m9, m5, m4 + punpckhwd m5, m4 + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] + paddd m7, m8 + paddd m9, m8 + paddd m4, m8 + paddd m5, m8 + psrad m7, 14 + psrad m9, 14 + psrad m4, 14 + psrad m5, 14 + packssdw m7, m4 + packssdw m9, m5 + + punpcklwd m4, m10, m0 + punpckhwd m10, m0 + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] + paddd m5, m8 + paddd m4, m8 + paddd m0, m8 + paddd m10, m8 + psrad m5, 14 + psrad m4, 14 + psrad m0, 14 + psrad m10, 14 + packssdw m5, m0 + packssdw m4, m10 + + ; transpose + ; stage 1 + punpcklwd m0, m2, m7 + punpcklwd m10, m1, m4 + punpckhwd m2, m7 + punpckhwd m1, m4 + punpcklwd m7, m6, m5 + punpcklwd m4, m3, m9 + punpckhwd m6, m5 + punpckhwd m3, m9 + + ; stage 2 + punpckldq m5, m0, m10 + punpckldq m9, m2, m1 + punpckhdq m0, m10 + punpckhdq m2, m1 + punpckldq m10, m7, m4 + punpckldq m1, m6, m3 + punpckhdq m7, m4 + punpckhdq m6, m3 + + ; stage 3 + punpcklqdq m4, m5, m10 + punpckhqdq m5, m10 + punpcklqdq m3, m0, m7 + punpckhqdq m0, m7 + punpcklqdq m10, m9, m1 + punpckhqdq m9, m1 + punpcklqdq m7, m2, m6 + punpckhqdq m2, m6 + + psraw m1, m4, 15 + psraw m6, m5, 15 + psraw m8, m3, 15 + psraw m11, m0, 15 + + psubw m4, m1 + psubw m5, m6 + psubw m3, m8 + psubw m0, m11 + + psraw m4, 1 + psraw m5, 1 + psraw m3, 1 + psraw m0, 1 + + psraw m1, m10, 15 + psraw m6, m9, 15 + psraw m8, m7, 15 + psraw m11, m2, 15 + + psubw m10, m1 + psubw m9, m6 + psubw m7, m8 + psubw m2, m11 + + psraw m10, 1 + psraw m9, 1 + psraw m7, 1 + psraw m2, 1 + + STORE_OUTPUT 0, 4 + STORE_OUTPUT 8, 5 + STORE_OUTPUT 16, 3 + STORE_OUTPUT 24, 0 + STORE_OUTPUT 32, 10 + STORE_OUTPUT 40, 9 + STORE_OUTPUT 48, 7 + STORE_OUTPUT 56, 2 RET %endif diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm index 60446b086..99f17ebdf 100644 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm +++ b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm @@ -13,6 +13,8 @@ %include "aom_ports/x86_abi_support.asm" +SECTION .text + ;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, ; int ref_stride, ; unsigned char *src, diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c index a99c0b40e..2a018c1cf 100644 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c @@ -11,8 +11,9 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c index 133640eb7..e5e3238d5 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -11,8 +11,11 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/convolve.h" +#include "aom_dsp/x86/convolve_avx2.h" +#include "aom_dsp/x86/synonyms.h" // ----------------------------------------------------------------------------- // Copy and average @@ -100,103 +103,258 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, } } -void aom_highbd_convolve_avg_avx2(const uint8_t *src8, ptrdiff_t src_stride, - uint8_t *dst8, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int width, int h, int bd) { - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - (void)bd; +void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m256i s[8], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m256i src6; + __m256i s01 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + 0x20); + __m256i s12 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + 0x20); + __m256i s23 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + 0x20); + __m256i s34 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + 0x20); + __m256i s45 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + 0x20); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + __m256i s56 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), + src6, 0x20); + + s[0] = _mm256_unpacklo_epi16(s01, s12); + s[1] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpacklo_epi16(s45, s56); + + s[4] = _mm256_unpackhi_epi16(s01, s12); + s[5] = _mm256_unpackhi_epi16(s23, s34); + s[6] = _mm256_unpackhi_epi16(s45, s56); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + const __m256i s67 = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + + const __m256i s78 = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi16(s67, s78); + s[7] = _mm256_unpackhi_epi16(s67, s78); + + const __m256i res_a = convolve(s, coeffs_y); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m256i res_b = convolve(s + 4, coeffs_y); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_16bit, 1)); + } else if (w == 4) { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } else { + res_a_round = _mm256_packs_epi32(res_a_round, res_a_round); + res_a_round = _mm256_min_epi16(res_a_round, clip_pixel); + res_a_round = _mm256_max_epi16(res_a_round, zero); + + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res_a_round)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res_a_round, 1)); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } + } + } +} - assert(width % 4 == 0); - if (width > 32) { // width = 64 - __m256i p0, p1, p2, p3, u0, u1, u2, u3; - do { - p0 = _mm256_loadu_si256((const __m256i *)src); - p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); - p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); - p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); - src += src_stride; - u0 = _mm256_loadu_si256((const __m256i *)dst); - u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); - u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); - u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); - _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); - _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); - _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); - _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); - dst += dst_stride; - h--; - } while (h > 0); - } else if (width > 16) { // width = 32 - __m256i p0, p1, u0, u1; - do { - p0 = _mm256_loadu_si256((const __m256i *)src); - p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); - src += src_stride; - u0 = _mm256_loadu_si256((const __m256i *)dst); - u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); - _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); - _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); - dst += dst_stride; - h--; - } while (h > 0); - } else if (width > 8) { // width = 16 - __m256i p0, p1, u0, u1; - do { - p0 = _mm256_loadu_si256((const __m256i *)src); - p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); - src += src_stride << 1; - u0 = _mm256_loadu_si256((const __m256i *)dst); - u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); - - _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); - _mm256_storeu_si256((__m256i *)(dst + dst_stride), - _mm256_avg_epu16(p1, u1)); - dst += dst_stride << 1; - h -= 2; - } while (h > 0); - } else if (width > 4) { // width = 8 - __m128i p0, p1, u0, u1; - do { - p0 = _mm_loadu_si128((const __m128i *)src); - p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); - src += src_stride << 1; - u0 = _mm_loadu_si128((const __m128i *)dst); - u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); - - _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); - _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); - dst += dst_stride << 1; - h -= 2; - } while (h > 0); - } else { // width = 4 - __m128i p0, p1, u0, u1; - do { - p0 = _mm_loadl_epi64((const __m128i *)src); - p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - src += src_stride << 1; - u0 = _mm_loadl_epi64((const __m128i *)dst); - u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); - - _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); - _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); - dst += dst_stride << 1; - h -= 2; - } while (h > 0); +void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_q4; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m256i s[4], coeffs_x[4]; + + const __m256i round_const_x = + _mm256_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + assert(bits >= 0); + assert((FILTER_BITS - conv_params->round_1) >= 0 || + ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + for (i = 0; i < h; i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]); + + const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = _mm256_alignr_epi8(r1, r0, 0); + s[1] = _mm256_alignr_epi8(r1, r0, 4); + s[2] = _mm256_alignr_epi8(r1, r0, 8); + s[3] = _mm256_alignr_epi8(r1, r0, 12); + + __m256i res_even = convolve(s, coeffs_x); + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + s[2] = _mm256_alignr_epi8(r1, r0, 10); + s[3] = _mm256_alignr_epi8(r1, r0, 14); + + __m256i res_odd = convolve(s, coeffs_x); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x), + round_shift_x); + + res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m256i res_even1 = _mm256_packs_epi32(res_even, res_even); + __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd); + + __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } else { + xx_storel_32((__m128i *)&dst[i * dst_stride + j], + _mm256_castsi256_si128(res)); + xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride], + _mm256_extracti128_si256(res, 1)); + } + } } } +#define CONV8_ROUNDING_BITS (7) + // ----------------------------------------------------------------------------- // Horizontal and vertical filtering -#define CONV8_ROUNDING_BITS (7) - static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; @@ -817,250 +975,6 @@ static void aom_highbd_filter_block1d8_v2_avx2( } while (height > 0); } -// Calculation with averaging the input pixels - -static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, - uint16_t *dst) { - const __m128i a0 = _mm256_castsi256_si128(*y0); - const __m128i a1 = _mm256_extractf128_si256(*y0, 1); - __m128i res = _mm_packus_epi32(a0, a1); - const __m128i pix = _mm_loadu_si128((const __m128i *)dst); - res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); - res = _mm_avg_epu16(res, pix); - _mm_storeu_si128((__m128i *)dst, res); -} - -static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - __m256i a = _mm256_packus_epi32(*y0, *y1); - const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); - const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch)); - const __m256i pix = - _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); - a = _mm256_min_epi16(a, *mask); - a = _mm256_avg_epu16(a, pix); - _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); - _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); -} - -static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst) { - __m256i a = _mm256_packus_epi32(*y0, *y1); - const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); - a = _mm256_min_epi16(a, *mask); - a = _mm256_avg_epu16(a, pix); - _mm256_storeu_si256((__m256i *)dst, a); -} - -static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, - const __m256i *mask, uint16_t *dst, - ptrdiff_t pitch) { - const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); - const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); - __m256i p = _mm256_min_epi16(*y0, *mask); - p = _mm256_avg_epu16(p, pix0); - _mm256_storeu_si256((__m256i *)dst, p); - - p = _mm256_min_epi16(*y1, *mask); - p = _mm256_avg_epu16(p, pix1); - _mm256_storeu_si256((__m256i *)(dst + pitch), p); -} - -static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, - const __m128i *y1, - const __m128i *mask, - uint16_t *dst) { - __m128i res = _mm_packus_epi32(*y0, *y1); - const __m128i pix = _mm_loadu_si128((const __m128i *)dst); - res = _mm_min_epi16(res, *mask); - res = _mm_avg_epu16(res, pix); - _mm_storeu_si128((__m128i *)dst, res); -} - -static void aom_highbd_filter_block1d8_h8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[8], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - src_ptr -= 3; - do { - pack_8x2_pixels(src_ptr, src_pitch, signal); - filter_8x1_pixels(signal, ff, &res0); - filter_8x1_pixels(&signal[4], ff, &res1); - store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - height -= 2; - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - } while (height > 1); - - if (height > 0) { - pack_8x1_pixels(src_ptr, signal); - filter_8x1_pixels(signal, ff, &res0); - store_8x1_avg_pixels(&res0, &max, dst_ptr); - } -} - -static void aom_highbd_filter_block1d16_h8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[8], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - src_ptr -= 3; - do { - pack_16x1_pixels(src_ptr, signal); - filter_8x1_pixels(signal, ff, &res0); - filter_8x1_pixels(&signal[4], ff, &res1); - store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); - height -= 1; - src_ptr += src_pitch; - dst_ptr += dst_pitch; - } while (height > 0); -} - -static void aom_highbd_filter_block1d8_v8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[9], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - pack_8x9_init(src_ptr, src_pitch, signal); - - do { - pack_8x9_pixels(src_ptr, src_pitch, signal); - - filter_8x9_pixels(signal, ff, &res0, &res1); - store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - update_pixels(signal); - - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - height -= 2; - } while (height > 0); -} - -static void aom_highbd_filter_block1d16_v8_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[17], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff[4]; - pack_filters(filter, ff); - - pack_16x9_init(src_ptr, src_pitch, signal); - - do { - pack_16x9_pixels(src_ptr, src_pitch, signal); - filter_16x9_pixels(signal, ff, &res0, &res1); - store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - update_16x9_pixels(signal); - - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - height -= 2; - } while (height > 0); -} - -static void aom_highbd_filter_block1d8_h2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[2], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff; - pack_2t_filter(filter, &ff); - - src_ptr -= 3; - do { - pack_8x2_2t_pixels(src_ptr, src_pitch, signal); - filter_16_2t_pixels(signal, &ff, &res0, &res1); - store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); - height -= 2; - src_ptr += src_pitch << 1; - dst_ptr += dst_pitch << 1; - } while (height > 1); - - if (height > 0) { - pack_8x1_2t_pixels(src_ptr, signal); - filter_8x1_2t_pixels(signal, &ff, &res0); - store_8x1_avg_pixels(&res0, &max, dst_ptr); - } -} - -static void aom_highbd_filter_block1d16_h2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[2], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - - __m256i ff; - pack_2t_filter(filter, &ff); - - src_ptr -= 3; - do { - pack_16x1_2t_pixels(src_ptr, signal); - filter_16_2t_pixels(signal, &ff, &res0, &res1); - store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); - height -= 1; - src_ptr += src_pitch; - dst_ptr += dst_pitch; - } while (height > 0); -} - -static void aom_highbd_filter_block1d16_v2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m256i signal[3], res0, res1; - const __m256i max = _mm256_set1_epi16((1 << bd) - 1); - __m256i ff; - - pack_2t_filter(filter, &ff); - pack_16x2_init(src_ptr, signal); - - do { - pack_16x2_2t_pixels(src_ptr, src_pitch, signal); - filter_16x2_2t_pixels(signal, &ff, &res0, &res1); - store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); - - src_ptr += src_pitch; - dst_ptr += dst_pitch; - height -= 1; - } while (height > 0); -} - -static void aom_highbd_filter_block1d8_v2_avg_avx2( - const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, - ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { - __m128i signal[3], res0, res1; - const __m128i max = _mm_set1_epi16((1 << bd) - 1); - __m128i ff; - - pack_8x1_2t_filter(filter, &ff); - pack_8x2_init(src_ptr, signal); - - do { - pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); - filter_8_2t_pixels(signal, &ff, &res0, &res1); - store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr); - - src_ptr += src_pitch; - dst_ptr += dst_pitch; - height -= 1; - } while (height > 0); -} - void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, ptrdiff_t, uint32_t, const int16_t *, int); @@ -1080,32 +994,5 @@ void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); -HIGH_FUN_CONV_2D(, avx2); - -void aom_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void aom_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void aom_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void aom_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -#define aom_highbd_filter_block1d4_h8_avg_avx2 \ - aom_highbd_filter_block1d4_h8_avg_sse2 -#define aom_highbd_filter_block1d4_h2_avg_avx2 \ - aom_highbd_filter_block1d4_h2_avg_sse2 -#define aom_highbd_filter_block1d4_v8_avg_avx2 \ - aom_highbd_filter_block1d4_v8_avg_sse2 -#define aom_highbd_filter_block1d4_v2_avg_avx2 \ - aom_highbd_filter_block1d4_v2_avg_sse2 - -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - avx2); -HIGH_FUN_CONV_2D(avg_, avx2); #undef HIGHBD_FUNC diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c new file mode 100644 index 000000000..f7ac9b496 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/convolve_sse2.h" + +void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_vert * src_stride; + (void)filter_params_x; + (void)subpel_x_q4; + (void)conv_params; + + assert(conv_params->round_0 <= FILTER_BITS); + assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || + ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); + + __m128i s[16], coeffs_y[4]; + + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + for (j = 0; j < w; j += 8) { + const uint16_t *data = &src_ptr[j]; + /* Vertical filter */ + { + __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + + s[0] = _mm_unpacklo_epi16(s0, s1); + s[1] = _mm_unpacklo_epi16(s2, s3); + s[2] = _mm_unpacklo_epi16(s4, s5); + + s[4] = _mm_unpackhi_epi16(s0, s1); + s[5] = _mm_unpackhi_epi16(s2, s3); + s[6] = _mm_unpackhi_epi16(s4, s5); + + s[0 + 8] = _mm_unpacklo_epi16(s1, s2); + s[1 + 8] = _mm_unpacklo_epi16(s3, s4); + s[2 + 8] = _mm_unpacklo_epi16(s5, s6); + + s[4 + 8] = _mm_unpackhi_epi16(s1, s2); + s[5 + 8] = _mm_unpackhi_epi16(s3, s4); + s[6 + 8] = _mm_unpackhi_epi16(s5, s6); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + + __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride)); + __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride)); + + s[3] = _mm_unpacklo_epi16(s6, s7); + s[7] = _mm_unpackhi_epi16(s6, s7); + + s[3 + 8] = _mm_unpacklo_epi16(s7, s8); + s[7 + 8] = _mm_unpackhi_epi16(s7, s8); + + const __m128i res_a0 = convolve(s, coeffs_y); + __m128i res_a_round0 = _mm_sra_epi32( + _mm_add_epi32(res_a0, round_const_bits), round_shift_bits); + + const __m128i res_a1 = convolve(s + 8, coeffs_y); + __m128i res_a_round1 = _mm_sra_epi32( + _mm_add_epi32(res_a1, round_const_bits), round_shift_bits); + + if (w - j > 4) { + const __m128i res_b0 = convolve(s + 4, coeffs_y); + __m128i res_b_round0 = _mm_sra_epi32( + _mm_add_epi32(res_b0, round_const_bits), round_shift_bits); + + const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y); + __m128i res_b_round1 = _mm_sra_epi32( + _mm_add_epi32(res_b1, round_const_bits), round_shift_bits); + + __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0); + res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel); + res_16bit0 = _mm_max_epi16(res_16bit0, zero); + + __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1); + res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel); + res_16bit1 = _mm_max_epi16(res_16bit1, zero); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_16bit1); + } else if (w == 4) { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_a_round1); + } else { + res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0); + res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel); + res_a_round0 = _mm_max_epi16(res_a_round0, zero); + + res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1); + res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel); + res_a_round1 = _mm_max_epi16(res_a_round1, zero); + + *((uint32_t *)(&dst[i * dst_stride + j])) = + _mm_cvtsi128_si32(res_a_round0); + + *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) = + _mm_cvtsi128_si32(res_a_round1); + } + + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + + s[0 + 8] = s[1 + 8]; + s[1 + 8] = s[2 + 8]; + s[2 + 8] = s[3 + 8]; + + s[4 + 8] = s[5 + 8]; + s[5 + 8] = s[6 + 8]; + s[6 + 8] = s[7 + 8]; + + s6 = s8; + } + } + } +} + +void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + InterpFilterParams *filter_params_x, + InterpFilterParams *filter_params_y, + const int subpel_x_q4, + const int subpel_y_q4, + ConvolveParams *conv_params, int bd) { + int i, j; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint16_t *const src_ptr = src - fo_horiz; + (void)subpel_y_q4; + (void)filter_params_y; + + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + + __m128i s[4], coeffs_x[4]; + + const __m128i round_const_x = + _mm_set1_epi32(((1 << conv_params->round_0) >> 1)); + const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0); + + const int bits = FILTER_BITS - conv_params->round_0; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1); + const __m128i clip_pixel = + _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m128i zero = _mm_setzero_si128(); + + prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + + for (j = 0; j < w; j += 8) { + /* Horizontal filter */ + { + for (i = 0; i < h; i += 1) { + const __m128i row00 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); + const __m128i row01 = + _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]); + + // even pixels + s[0] = _mm_alignr_epi8(row01, row00, 0); + s[1] = _mm_alignr_epi8(row01, row00, 4); + s[2] = _mm_alignr_epi8(row01, row00, 8); + s[3] = _mm_alignr_epi8(row01, row00, 12); + + __m128i res_even = convolve(s, coeffs_x); + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), + round_shift_x); + + // odd pixels + s[0] = _mm_alignr_epi8(row01, row00, 2); + s[1] = _mm_alignr_epi8(row01, row00, 6); + s[2] = _mm_alignr_epi8(row01, row00, 10); + s[3] = _mm_alignr_epi8(row01, row00, 14); + + __m128i res_odd = convolve(s, coeffs_x); + res_odd = + _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x); + + res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits), + round_shift_bits); + res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits), + round_shift_bits); + + __m128i res_even1 = _mm_packs_epi32(res_even, res_even); + __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd); + __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1); + + res = _mm_min_epi16(res, clip_pixel); + res = _mm_max_epi16(res, zero); + + if (w - j > 4) { + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } else if (w == 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res); + } else { + *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res); + } + } + } + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c deleted file mode 100644 index e001a1d70..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "aom_ports/msvc.h" -#include "./aom_dsp_rtcd.h" - -// ----------------------------------------------------------------------------- -// D45E_PRED -/* -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -*/ -static INLINE __m256i avg3_epu16(const __m256i *x, const __m256i *y, - const __m256i *z) { - const __m256i one = _mm256_set1_epi16(1); - const __m256i a = _mm256_avg_epu16(*x, *z); - const __m256i b = - _mm256_subs_epu16(a, _mm256_and_si256(_mm256_xor_si256(*x, *z), one)); - return _mm256_avg_epu16(b, *y); -} - -static INLINE void d45e_w16(const __m256i *a0, const __m256i *a1, - const __m256i *a2, uint16_t **dst, - ptrdiff_t stride) { - const __m256i y = avg3_epu16(a0, a1, a2); - _mm256_storeu_si256((__m256i *)*dst, y); - *dst += stride; -} - -void aom_highbd_d45e_predictor_16x8_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - - d45e_w16(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x0, &x1, &x2, &dst, stride); - } while (i < 9); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 9)); - x0 = _mm256_insert_epi16(x0, above[23], 15); - const __m256i y = avg3_epu16(&x1, &x2, &x0); - _mm256_storeu_si256((__m256i *)dst, y); -} - -void aom_highbd_d45e_predictor_16x16_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - - d45e_w16(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x0, &x1, &x2, &dst, stride); - } while (i < 15); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); - x2 = _mm256_insert_epi16(x2, above[31], 15); - const __m256i y = avg3_epu16(&x0, &x1, &x2); - _mm256_storeu_si256((__m256i *)dst, y); -} - -void aom_highbd_d45e_predictor_16x32_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - - d45e_w16(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x1, &x2, &x0, &dst, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x2, &x0, &x1, &dst, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i++)); - d45e_w16(&x0, &x1, &x2, &dst, stride); - } while (i < 33); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); - x0 = _mm256_insert_epi16(x0, above[47], 15); - const __m256i y = avg3_epu16(&x1, &x2, &x0); - _mm256_storeu_si256((__m256i *)dst, y); -} - -void aom_highbd_d45e_predictor_32x16_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); - __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); - __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); - - uint16_t *dst1 = dst; - uint16_t *dst2 = dst + 16; - - d45e_w16(&x0, &x1, &x2, &dst1, stride); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x1, &x2, &x0, &dst1, stride); - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y1, &y2, &y0, &dst2, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x2, &x0, &x1, &dst1, stride); - y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y2, &y0, &y1, &dst2, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x0, &x1, &x2, &dst1, stride); - y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - } while (i < 15); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 15)); - d45e_w16(&x1, &x2, &x0, &dst1, stride); - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 15)); - d45e_w16(&y1, &y2, &y0, &dst2, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + 16)); - d45e_w16(&x2, &x0, &x1, &dst1, stride); - y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + 16)); - d45e_w16(&y2, &y0, &y1, &dst2, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + 17)); - __m256i u = avg3_epu16(&x0, &x1, &x2); - _mm256_storeu_si256((__m256i *)dst1, u); - - y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + 17)); - y2 = _mm256_insert_epi16(y2, above[47], 15); - u = avg3_epu16(&y0, &y1, &y2); - _mm256_storeu_si256((__m256i *)dst2, u); -} - -void aom_highbd_d45e_predictor_32x32_avx2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m256i x0 = _mm256_loadu_si256((const __m256i *)above); - __m256i x1 = _mm256_loadu_si256((const __m256i *)(above + 1)); - __m256i x2 = _mm256_loadu_si256((const __m256i *)(above + 2)); - __m256i y0 = _mm256_loadu_si256((const __m256i *)(above + 16)); - __m256i y1 = _mm256_loadu_si256((const __m256i *)(above + 17)); - __m256i y2 = _mm256_loadu_si256((const __m256i *)(above + 18)); - - uint16_t *dst1 = dst; - uint16_t *dst2 = dst + 16; - - d45e_w16(&x0, &x1, &x2, &dst1, stride); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - - int i = 3; - do { - x0 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x1, &x2, &x0, &dst1, stride); - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y1, &y2, &y0, &dst2, stride); - - x1 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x2, &x0, &x1, &dst1, stride); - y1 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y2, &y0, &y1, &dst2, stride); - - x2 = _mm256_loadu_si256((const __m256i *)(above + i)); - d45e_w16(&x0, &x1, &x2, &dst1, stride); - y2 = _mm256_loadu_si256((const __m256i *)(above + 16 + i++)); - d45e_w16(&y0, &y1, &y2, &dst2, stride); - } while (i < 33); - - x0 = _mm256_loadu_si256((const __m256i *)(above + 33)); - __m256i u = avg3_epu16(&x1, &x2, &x0); - _mm256_storeu_si256((__m256i *)dst1, u); - - y0 = _mm256_loadu_si256((const __m256i *)(above + 16 + 33)); - y0 = _mm256_insert_epi16(y0, above[63], 15); - u = avg3_epu16(&y1, &y2, &y0); - _mm256_storeu_si256((__m256i *)dst2, u); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c index 691e166cf..5a55736c4 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c @@ -11,7 +11,7 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" // ----------------------------------------------------------------------------- // H_PRED @@ -982,275 +982,3 @@ void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride, dst += stride; } } - -// ----------------------------------------------------------------------------- -/* -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -*/ -static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, - const __m128i *z) { - const __m128i one = _mm_set1_epi16(1); - const __m128i a = _mm_avg_epu16(*x, *z); - const __m128i b = - _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); - return _mm_avg_epu16(b, *y); -} - -void aom_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); - const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); - const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); - const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); - const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); - const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); - const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); - const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); - const __m128i row0 = _mm_srli_si128(avg2, 6); - const __m128i row1 = _mm_srli_si128(avg3, 4); - const __m128i row2 = _mm_srli_si128(avg2, 4); - const __m128i row3 = _mm_srli_si128(avg3, 2); - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); - - dst -= stride; - dst[0] = _mm_extract_epi16(avg3, 1); - dst[stride] = _mm_extract_epi16(avg3, 0); -} - -void aom_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); - const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); - const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); - const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); - const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); - const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); - const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); - const __m128i row0 = _mm_srli_si128(avg3, 6); - const __m128i row1 = _mm_srli_si128(avg3, 4); - const __m128i row2 = _mm_srli_si128(avg3, 2); - const __m128i row3 = avg3; - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); - const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); - const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); - const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); - const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); - const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); - const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); - const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); - const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); - const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); - const __m128i row2 = _mm_srli_si128(row3, 4); - const __m128i row1 = _mm_srli_si128(row3, 8); - const __m128i row0 = _mm_srli_si128(avg3, 4); - (void)bd; - _mm_storel_epi64((__m128i *)dst, row0); - dst[0] = _mm_extract_epi16(avg2, 3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_highbd_d45e_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); - const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); - __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); - CDEFGH00 = _mm_insert_epi16(CDEFGH00, above[7], 6); - const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); - (void)left; - (void)bd; - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); -} - -void aom_highbd_d45e_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i h76543210 = _mm_load_si128((const __m128i *)above); - __m128i hx7654321 = _mm_srli_si128(h76543210, 2); - __m128i h87654321 = _mm_insert_epi16(hx7654321, above[8], 7); - __m128i hx8765432 = _mm_srli_si128(h87654321, 2); - __m128i h98765432 = _mm_insert_epi16(hx8765432, above[9], 7); - __m128i avg3 = avg3_epu16(&h76543210, &h87654321, &h98765432); - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 8)); - dst += stride; - - // hcba98765 - h76543210 = _mm_loadu_si128((const __m128i *)((above + 5))); - h76543210 = _mm_insert_epi16(h76543210, above[11], 7); - // hxcba9876 - hx7654321 = _mm_srli_si128(h76543210, 2); - // hxxcba987 - hx8765432 = _mm_srli_si128(h76543210, 4); - avg3 = avg3_epu16(&h76543210, &hx7654321, &hx8765432); - _mm_storel_epi64((__m128i *)dst, avg3); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); - dst += stride; - _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); -} - -void aom_highbd_d45e_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - __m128i y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x0 = _mm_loadu_si128((const __m128i *)(above + 3)); - y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x1 = _mm_loadu_si128((const __m128i *)(above + 4)); - y = avg3_epu16(&x2, &x0, &x1); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x2 = _mm_loadu_si128((const __m128i *)(above + 5)); - x2 = _mm_insert_epi16(x2, above[11], 7); - y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); -} - -static INLINE void d45e_w8(const __m128i *a0, const __m128i *a1, - const __m128i *a2, uint16_t **dst, - ptrdiff_t stride) { - const __m128i y = avg3_epu16(a0, a1, a2); - _mm_storeu_si128((__m128i *)*dst, y); - *dst += stride; -} - -void aom_highbd_d45e_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - - d45e_w8(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x1, &x2, &x0, &dst, stride); - - x1 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x2, &x0, &x1, &dst, stride); - - x2 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x0, &x1, &x2, &dst, stride); - } while (i < 9); - - x0 = _mm_loadu_si128((const __m128i *)(above + 9)); - x0 = _mm_insert_epi16(x0, above[15], 7); - const __m128i y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); -} - -void aom_highbd_d45e_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - (void)left; - (void)bd; - __m128i x0 = _mm_load_si128((const __m128i *)above); - __m128i x1 = _mm_loadu_si128((const __m128i *)(above + 1)); - __m128i x2 = _mm_loadu_si128((const __m128i *)(above + 2)); - - d45e_w8(&x0, &x1, &x2, &dst, stride); - - int i = 3; - do { - x0 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x1, &x2, &x0, &dst, stride); - - x1 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x2, &x0, &x1, &dst, stride); - - x2 = _mm_loadu_si128((const __m128i *)(above + i++)); - d45e_w8(&x0, &x1, &x2, &dst, stride); - } while (i < 15); - - x0 = _mm_loadu_si128((const __m128i *)(above + 15)); - __m128i y = avg3_epu16(&x1, &x2, &x0); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x1 = _mm_loadu_si128((const __m128i *)(above + 16)); - y = avg3_epu16(&x2, &x0, &x1); - _mm_store_si128((__m128i *)dst, y); - dst += stride; - - x2 = _mm_loadu_si128((const __m128i *)(above + 17)); - x2 = _mm_insert_epi16(x2, above[23], 7); - y = avg3_epu16(&x0, &x1, &x2); - _mm_store_si128((__m128i *)dst, y); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c deleted file mode 100644 index b089a3f43..000000000 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_ssse3.c +++ /dev/null @@ -1,521 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" - -// ----------------------------------------------------------------------------- -/* -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -*/ -static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, - const __m128i *z) { - const __m128i one = _mm_set1_epi16(1); - const __m128i a = _mm_avg_epu16(*x, *z); - const __m128i b = - _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); - return _mm_avg_epu16(b, *y); -} - -DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 -}; - -static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { - *a = _mm_shuffle_epi8(*a, *rotrw); - return *a; -} - -void aom_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); - const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); - const __m128i IXABCDEF = - _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); - const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); - const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); - const __m128i XIJKLMNO = - _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); - const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); - __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); - __m128i rowa = avg2; - __m128i rowb = avg3; - int i; - (void)bd; - for (i = 0; i < 8; i += 2) { - _mm_store_si128((__m128i *)dst, rowa); - dst += stride; - _mm_store_si128((__m128i *)dst, rowb); - dst += stride; - rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); - rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); - } -} - -void aom_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A0 = _mm_load_si128((const __m128i *)above); - const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i avg2_0 = _mm_avg_epu16(A0, B0); - const __m128i avg2_1 = _mm_avg_epu16(A1, B1); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); - const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); - const __m128i L1_ = _mm_srli_si128(L1, 2); - __m128i rowa_0 = avg2_0; - __m128i rowa_1 = avg2_1; - __m128i rowb_0 = avg3_0; - __m128i rowb_1 = avg3_1; - __m128i avg3_left[2]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); - avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); - for (i = 0; i < 2; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; j += 2) { - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - dst += stride; - _mm_store_si128((__m128i *)dst, rowb_0); - _mm_store_si128((__m128i *)(dst + 8), rowb_1); - dst += stride; - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); - rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); - rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); - } - } -} - -void aom_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i A0 = _mm_load_si128((const __m128i *)above); - const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); - const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); - const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); - const __m128i avg2_0 = _mm_avg_epu16(A0, B0); - const __m128i avg2_1 = _mm_avg_epu16(A1, B1); - const __m128i avg2_2 = _mm_avg_epu16(A2, B2); - const __m128i avg2_3 = _mm_avg_epu16(A3, B3); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); - const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); - const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); - const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); - const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); - const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); - const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); - const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); - const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); - const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); - const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); - const __m128i L3_ = _mm_srli_si128(L3, 2); - __m128i rowa_0 = avg2_0; - __m128i rowa_1 = avg2_1; - __m128i rowa_2 = avg2_2; - __m128i rowa_3 = avg2_3; - __m128i rowb_0 = avg3_0; - __m128i rowb_1 = avg3_1; - __m128i rowb_2 = avg3_2; - __m128i rowb_3 = avg3_3; - __m128i avg3_left[4]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); - avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); - avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); - avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); - for (i = 0; i < 4; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; j += 2) { - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - _mm_store_si128((__m128i *)(dst + 16), rowa_2); - _mm_store_si128((__m128i *)(dst + 24), rowa_3); - dst += stride; - _mm_store_si128((__m128i *)dst, rowb_0); - _mm_store_si128((__m128i *)(dst + 8), rowb_1); - _mm_store_si128((__m128i *)(dst + 16), rowb_2); - _mm_store_si128((__m128i *)(dst + 24), rowb_3); - dst += stride; - rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); - rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); - rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); - rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); - rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); - rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); - } - } -} - -void aom_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); - const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); - const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); - const __m128i XIJKLMNO = - _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); - const __m128i AXIJKLMN = - _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); - const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); - __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); - __m128i rowa = avg3; - int i; - (void)bd; - for (i = 0; i < 8; ++i) { - rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); - _mm_store_si128((__m128i *)dst, rowa); - dst += stride; - } -} - -void aom_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i B0 = _mm_load_si128((const __m128i *)above); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); - const __m128i C1 = _mm_srli_si128(B1, 2); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); - const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); - __m128i rowa_0 = avg3_0; - __m128i rowa_1 = avg3_1; - __m128i avg3_left[2]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); - avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); - for (i = 0; i < 2; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; ++j) { - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - dst += stride; - } - } -} - -void aom_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); - const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); - const __m128i B0 = _mm_load_si128((const __m128i *)above); - const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); - const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); - const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); - const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); - const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); - const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); - const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); - const __m128i C3 = _mm_srli_si128(B3, 2); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); - const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); - const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); - const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); - const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); - const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); - const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); - __m128i rowa_0 = avg3_0; - __m128i rowa_1 = avg3_1; - __m128i rowa_2 = avg3_2; - __m128i rowa_3 = avg3_3; - __m128i avg3_left[4]; - int i, j; - (void)bd; - avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); - avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); - avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); - avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); - for (i = 0; i < 4; ++i) { - __m128i avg_left = avg3_left[i]; - for (j = 0; j < 8; ++j) { - rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); - rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); - rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); - rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); - _mm_store_si128((__m128i *)dst, rowa_0); - _mm_store_si128((__m128i *)(dst + 8), rowa_1); - _mm_store_si128((__m128i *)(dst + 16), rowa_2); - _mm_store_si128((__m128i *)(dst + 24), rowa_3); - dst += stride; - } - } -} - -void aom_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); - const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); - const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); - const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); - const __m128i XIJKLMNO = - _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); - const __m128i AXIJKLMN = - _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); - const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); - const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); - const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); - const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); - const __m128i row0 = - _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); - const __m128i row1 = - _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); - const __m128i row2 = - _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); - const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); - const __m128i row4 = - _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); - const __m128i row5 = - _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); - const __m128i row6 = - _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); - const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); - (void)bd; - _mm_store_si128((__m128i *)dst, row0); - dst += stride; - _mm_store_si128((__m128i *)dst, row1); - dst += stride; - _mm_store_si128((__m128i *)dst, row2); - dst += stride; - _mm_store_si128((__m128i *)dst, row3); - dst += stride; - _mm_store_si128((__m128i *)dst, row4); - dst += stride; - _mm_store_si128((__m128i *)dst, row5); - dst += stride; - _mm_store_si128((__m128i *)dst, row6); - dst += stride; - _mm_store_si128((__m128i *)dst, row7); -} - -void aom_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); - const __m128i B1 = _mm_srli_si128(A1, 2); - const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); - const __m128i C1 = _mm_srli_si128(A1, 4); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); - const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); - const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); - const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); - const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); - __m128i row_0 = avg3_0; - __m128i row_1 = avg3_1; - __m128i avg2_avg3_left[2][2]; - int i, j; - (void)bd; - - avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); - avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); - - for (j = 0; j < 2; ++j) { - for (i = 0; i < 2; ++i) { - const __m128i avg2_avg3 = avg2_avg3_left[j][i]; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - dst += stride; - } - } -} - -void aom_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, - const uint16_t *above, - const uint16_t *left, int bd) { - const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); - const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); - const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); - const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); - const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); - const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); - const __m128i B3 = _mm_srli_si128(A3, 2); - const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); - const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); - const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); - const __m128i C3 = _mm_srli_si128(A3, 4); - const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); - const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); - const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); - const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); - const __m128i L0 = _mm_load_si128((const __m128i *)left); - const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); - const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); - const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); - const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); - const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); - const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); - const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); - const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); - const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); - const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); - const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); - const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); - const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); - const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); - const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); - const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); - const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); - const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); - const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); - __m128i row_0 = avg3_0; - __m128i row_1 = avg3_1; - __m128i row_2 = avg3_2; - __m128i row_3 = avg3_3; - __m128i avg2_avg3_left[4][2]; - int i, j; - (void)bd; - - avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); - avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); - avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); - avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); - avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); - avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); - avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); - - for (j = 0; j < 4; ++j) { - for (i = 0; i < 2; ++i) { - const __m128i avg2_avg3 = avg2_avg3_left[j][i]; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - row_3 = _mm_alignr_epi8(row_3, row_2, 12); - row_2 = _mm_alignr_epi8(row_2, row_1, 12); - row_1 = _mm_alignr_epi8(row_1, row_0, 12); - row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); - _mm_store_si128((__m128i *)dst, row_0); - _mm_store_si128((__m128i *)(dst + 8), row_1); - _mm_store_si128((__m128i *)(dst + 16), row_2); - _mm_store_si128((__m128i *)(dst + 24), row_3); - dst += stride; - } - } -} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c index 94c68885c..c954da94e 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c @@ -11,210 +11,26 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/x86/common_avx2.h" #include "aom_dsp/x86/lpf_common_sse2.h" #include "aom/aom_integer.h" -#if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 -static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, - const uint8_t *t, int bd, __m256i *blt, - __m256i *lt, __m256i *thr) { - const int shift = bd - 8; - const __m128i zero = _mm_setzero_si128(); - - __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero); - __m256i y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *blt = _mm256_slli_epi16(y, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero); - y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *lt = _mm256_slli_epi16(y, shift); - - x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); - y = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); - *thr = _mm256_slli_epi16(y, shift); -} - -static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, - __m256i *p, __m256i *q) { - int i; - for (i = 0; i < size; i++) { - p[i] = _mm256_loadu_si256((__m256i *)(s - (i + 1) * pitch)); - q[i] = _mm256_loadu_si256((__m256i *)(s + i * pitch)); - } -} - -static INLINE void highbd_hev_mask(const __m256i *p, const __m256i *q, - const __m256i *t, __m256i *hev) { - const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], p[0])); - const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q[1], q[0])); - __m256i h = _mm256_max_epi16(abs_p1p0, abs_q1q0); - h = _mm256_subs_epu16(h, *t); - - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - const __m256i zero = _mm256_setzero_si256(); - *hev = _mm256_xor_si256(_mm256_cmpeq_epi16(h, zero), ffff); -} - -static INLINE void highbd_filter_mask(const __m256i *p, const __m256i *q, - const __m256i *l, const __m256i *bl, - __m256i *mask) { - __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p[0], q[0])); - __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p[1], q[1])); - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - - const __m256i zero = _mm256_setzero_si256(); - const __m256i one = _mm256_set1_epi16(1); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - __m256i max = _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), *bl); - max = _mm256_xor_si256(_mm256_cmpeq_epi16(max, zero), ffff); - max = _mm256_and_si256(max, _mm256_adds_epu16(*l, one)); - - int i; - for (i = 1; i < 4; ++i) { - max = _mm256_max_epi16(max, - _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[i - 1]))); - max = _mm256_max_epi16(max, - _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[i - 1]))); - } - max = _mm256_subs_epu16(max, *l); - *mask = _mm256_cmpeq_epi16(max, zero); // return ~mask -} - -static INLINE void flat_mask_internal(const __m256i *th, const __m256i *p, - const __m256i *q, int bd, int start, - int end, __m256i *flat) { - __m256i max = _mm256_setzero_si256(); - int i; - for (i = start; i < end; ++i) { - max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(p[i], p[0]))); - max = _mm256_max_epi16(max, _mm256_abs_epi16(_mm256_sub_epi16(q[i], q[0]))); - } - - __m256i ft; - if (bd == 8) - ft = _mm256_subs_epu16(max, *th); - else if (bd == 10) - ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 2)); - else // bd == 12 - ft = _mm256_subs_epu16(max, _mm256_slli_epi16(*th, 4)); - - const __m256i zero = _mm256_setzero_si256(); - *flat = _mm256_cmpeq_epi16(ft, zero); -} - -// Note: -// Access p[3-1], p[0], and q[3-1], q[0] -static INLINE void highbd_flat_mask4(const __m256i *th, const __m256i *p, - const __m256i *q, __m256i *flat, int bd) { - // check the distance 1,2,3 against 0 - flat_mask_internal(th, p, q, bd, 1, 4, flat); -} - -// Note: -// access p[7-4], p[0], and q[7-4], q[0] -static INLINE void highbd_flat_mask5(const __m256i *th, const __m256i *p, - const __m256i *q, __m256i *flat, int bd) { - flat_mask_internal(th, p, q, bd, 4, 8, flat); -} - -static INLINE void pixel_clamp(const __m256i *min, const __m256i *max, - __m256i *pixel) { - __m256i clamped, mask; - - mask = _mm256_cmpgt_epi16(*pixel, *max); - clamped = _mm256_andnot_si256(mask, *pixel); - mask = _mm256_and_si256(mask, *max); - clamped = _mm256_or_si256(mask, clamped); - - mask = _mm256_cmpgt_epi16(clamped, *min); - clamped = _mm256_and_si256(mask, clamped); - mask = _mm256_andnot_si256(mask, *min); - *pixel = _mm256_or_si256(clamped, mask); -} - -static INLINE void highbd_filter4(__m256i *p, __m256i *q, const __m256i *mask, - const __m256i *th, int bd, __m256i *ps, - __m256i *qs) { - __m256i t80; - if (bd == 8) - t80 = _mm256_set1_epi16(0x80); - else if (bd == 10) - t80 = _mm256_set1_epi16(0x200); - else // bd == 12 - t80 = _mm256_set1_epi16(0x800); - - __m256i ps0 = _mm256_subs_epi16(p[0], t80); - __m256i ps1 = _mm256_subs_epi16(p[1], t80); - __m256i qs0 = _mm256_subs_epi16(q[0], t80); - __m256i qs1 = _mm256_subs_epi16(q[1], t80); - - const __m256i one = _mm256_set1_epi16(1); - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i zero = _mm256_setzero_si256(); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filter = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filter); - - __m256i hev; - highbd_hev_mask(p, q, th, &hev); - filter = _mm256_and_si256(filter, hev); - - const __m256i x = _mm256_subs_epi16(qs0, ps0); - filter = _mm256_adds_epi16(filter, x); - filter = _mm256_adds_epi16(filter, x); - filter = _mm256_adds_epi16(filter, x); - pixel_clamp(&pmin, &pmax, &filter); - filter = _mm256_and_si256(filter, *mask); - - const __m256i t3 = _mm256_set1_epi16(3); - const __m256i t4 = _mm256_set1_epi16(4); - - __m256i filter1 = _mm256_adds_epi16(filter, t4); - __m256i filter2 = _mm256_adds_epi16(filter, t3); - pixel_clamp(&pmin, &pmax, &filter1); - pixel_clamp(&pmin, &pmax, &filter2); - filter1 = _mm256_srai_epi16(filter1, 3); - filter2 = _mm256_srai_epi16(filter2, 3); - - qs0 = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &qs0); - ps0 = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &ps0); - - qs[0] = _mm256_adds_epi16(qs0, t80); - ps[0] = _mm256_adds_epi16(ps0, t80); - - filter = _mm256_adds_epi16(filter1, one); - filter = _mm256_srai_epi16(filter, 1); - filter = _mm256_andnot_si256(hev, filter); - - qs1 = _mm256_subs_epi16(qs1, filter); - pixel_clamp(&pmin, &pmax, &qs1); - ps1 = _mm256_adds_epi16(ps1, filter); - pixel_clamp(&pmin, &pmax, &ps1); - - qs[1] = _mm256_adds_epi16(qs1, t80); - ps[1] = _mm256_adds_epi16(ps1, t80); -} -#endif // #if !CONFIG_PARALLEL_DEBLOCKING || !CONFIG_CB4X4 - -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 -void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int p, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - aom_highbd_lpf_horizontal_edge_16_sse2(s, p, blt, lt, thr, bd); +void aom_highbd_lpf_horizontal_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0, + blimit1, limit1, thresh1, bd); } -void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, - const uint8_t *blt, const uint8_t *lt, - const uint8_t *thr, int bd) { - aom_highbd_lpf_vertical_16_dual_sse2(s, p, blt, lt, thr, bd); +void aom_highbd_lpf_vertical_14_dual_avx2( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); } void aom_highbd_lpf_horizontal_4_dual_avx2( @@ -248,626 +64,3 @@ void aom_highbd_lpf_vertical_8_dual_avx2( aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1, limit1, thresh1, bd); } -#else -void aom_highbd_lpf_horizontal_edge_16_avx2(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - __m256i blimit, limit, thresh; - get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); - - __m256i p[8], q[8]; - load_highbd_pixel(s, 8, pitch, p, q); - - __m256i mask; - highbd_filter_mask(p, q, &limit, &blimit, &mask); - - __m256i flat, flat2; - const __m256i one = _mm256_set1_epi16(1); - highbd_flat_mask4(&one, p, q, &flat, bd); - highbd_flat_mask5(&one, p, q, &flat2, bd); - - flat = _mm256_and_si256(flat, mask); - flat2 = _mm256_and_si256(flat2, flat); - - __m256i ps[2], qs[2]; - highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); - - // flat and wide flat calculations - __m256i flat_p[3], flat_q[3]; - __m256i flat2_p[7], flat2_q[7]; - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - - __m256i sum_p = _mm256_add_epi16(_mm256_add_epi16(p[6], p[5]), - _mm256_add_epi16(p[4], p[3])); - __m256i sum_q = _mm256_add_epi16(_mm256_add_epi16(q[6], q[5]), - _mm256_add_epi16(q[4], q[3])); - - __m256i sum_lp = _mm256_add_epi16(p[0], _mm256_add_epi16(p[2], p[1])); - sum_p = _mm256_add_epi16(sum_p, sum_lp); - - __m256i sum_lq = _mm256_add_epi16(q[0], _mm256_add_epi16(q[2], q[1])); - sum_q = _mm256_add_epi16(sum_q, sum_lq); - sum_p = _mm256_add_epi16(eight, _mm256_add_epi16(sum_p, sum_q)); - sum_lp = _mm256_add_epi16(four, _mm256_add_epi16(sum_lp, sum_lq)); - - flat2_p[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(p[7], p[0])), 4); - flat2_q[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(q[7], q[0])), 4); - flat_p[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(p[3], p[0])), 3); - flat_q[0] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(q[3], q[0])), 3); - - __m256i sum_p7 = _mm256_add_epi16(p[7], p[7]); - __m256i sum_q7 = _mm256_add_epi16(q[7], q[7]); - __m256i sum_p3 = _mm256_add_epi16(p[3], p[3]); - __m256i sum_q3 = _mm256_add_epi16(q[3], q[3]); - - sum_q = _mm256_sub_epi16(sum_p, p[6]); - sum_p = _mm256_sub_epi16(sum_p, q[6]); - flat2_p[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[1])), 4); - flat2_q[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[1])), 4); - - sum_lq = _mm256_sub_epi16(sum_lp, p[2]); - sum_lp = _mm256_sub_epi16(sum_lp, q[2]); - flat_p[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[1])), 3); - flat_q[1] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[1])), 3); - - sum_p7 = _mm256_add_epi16(sum_p7, p[7]); - sum_q7 = _mm256_add_epi16(sum_q7, q[7]); - sum_p3 = _mm256_add_epi16(sum_p3, p[3]); - sum_q3 = _mm256_add_epi16(sum_q3, q[3]); - - sum_p = _mm256_sub_epi16(sum_p, q[5]); - sum_q = _mm256_sub_epi16(sum_q, p[5]); - flat2_p[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[2])), 4); - flat2_q[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[2])), 4); - - sum_lp = _mm256_sub_epi16(sum_lp, q[1]); - sum_lq = _mm256_sub_epi16(sum_lq, p[1]); - flat_p[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lp, _mm256_add_epi16(sum_p3, p[2])), 3); - flat_q[2] = _mm256_srli_epi16( - _mm256_add_epi16(sum_lq, _mm256_add_epi16(sum_q3, q[2])), 3); - - int i; - for (i = 3; i < 7; ++i) { - sum_p7 = _mm256_add_epi16(sum_p7, p[7]); - sum_q7 = _mm256_add_epi16(sum_q7, q[7]); - sum_p = _mm256_sub_epi16(sum_p, q[7 - i]); - sum_q = _mm256_sub_epi16(sum_q, p[7 - i]); - flat2_p[i] = _mm256_srli_epi16( - _mm256_add_epi16(sum_p, _mm256_add_epi16(sum_p7, p[i])), 4); - flat2_q[i] = _mm256_srli_epi16( - _mm256_add_epi16(sum_q, _mm256_add_epi16(sum_q7, q[i])), 4); - } - } - - // highbd_filter8 - p[2] = _mm256_andnot_si256(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm256_and_si256(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm256_or_si256(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm256_andnot_si256(flat, q[2]); - flat_q[2] = _mm256_and_si256(flat, flat_q[2]); - q[2] = _mm256_or_si256(q[2], flat_q[2]); // full list of q2 values - - int i; - for (i = 1; i >= 0; i--) { - ps[i] = _mm256_andnot_si256(flat, ps[i]); - flat_p[i] = _mm256_and_si256(flat, flat_p[i]); - p[i] = _mm256_or_si256(ps[i], flat_p[i]); - qs[i] = _mm256_andnot_si256(flat, qs[i]); - flat_q[i] = _mm256_and_si256(flat, flat_q[i]); - q[i] = _mm256_or_si256(qs[i], flat_q[i]); - } - - // highbd_filter16 - - for (i = 6; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm256_andnot_si256(flat2, p[i]); - flat2_p[i] = _mm256_and_si256(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm256_or_si256(p[i], flat2_p[i]); // full list of p values - - q[i] = _mm256_andnot_si256(flat2, q[i]); - flat2_q[i] = _mm256_and_si256(flat2, flat2_q[i]); - q[i] = _mm256_or_si256(q[i], flat2_q[i]); - _mm256_storeu_si256((__m256i *)(s - (i + 1) * pitch), p[i]); - _mm256_storeu_si256((__m256i *)(s + i * pitch), q[i]); - } -} - -static INLINE void highbd_transpose16x16(uint16_t *src, int src_p, - uint16_t *dst, int dst_p) { - __m256i x[16]; - int i; - for (i = 0; i < 16; ++i) { - x[i] = _mm256_loadu_si256((const __m256i *)src); - src += src_p; - } - mm256_transpose_16x16(x, x); - for (i = 0; i < 16; ++i) { - _mm256_storeu_si256((__m256i *)dst, x[i]); - dst += dst_p; - } -} - -void aom_highbd_lpf_vertical_16_dual_avx2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[256]); - - // Transpose 16x16 - highbd_transpose16x16(s - 8, p, t_dst, 16); - - // Loop filtering - aom_highbd_lpf_horizontal_edge_16_avx2(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); - - // Transpose back - highbd_transpose16x16(t_dst, 16, s - 8, p); -} - -static INLINE void get_dual_limit(const uint8_t *b0, const uint8_t *l0, - const uint8_t *t0, const uint8_t *b1, - const uint8_t *l1, const uint8_t *t1, int bd, - __m256i *blt, __m256i *lt, __m256i *thr) { - const __m128i z128 = _mm_setzero_si128(); - const __m128i blimit0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b0), z128); - const __m128i limit0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l0), z128); - const __m128i thresh0 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t0), z128); - const __m128i blimit1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)b1), z128); - const __m128i limit1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l1), z128); - const __m128i thresh1 = - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t1), z128); - - *blt = _mm256_inserti128_si256(_mm256_castsi128_si256(blimit0), blimit1, 1); - *lt = _mm256_inserti128_si256(_mm256_castsi128_si256(limit0), limit1, 1); - *thr = _mm256_inserti128_si256(_mm256_castsi128_si256(thresh0), thresh1, 1); - - int shift = bd - 8; - *blt = _mm256_slli_epi16(*blt, shift); - *lt = _mm256_slli_epi16(*lt, shift); - *thr = _mm256_slli_epi16(*thr, shift); -} - -void aom_highbd_lpf_horizontal_4_dual_avx2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); - __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); - __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); - __m256i q0 = _mm256_loadu_si256((__m256i *)(s - 0 * p)); - __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); - __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); - - const __m256i abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); - const __m256i abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); - - __m256i abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); - __m256i abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); - - __m256i blimit, limit, thresh; - get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit, &limit, &thresh); - - __m256i t80, tff80, tffe0, t1f, t7f; - if (bd == 8) { - t80 = _mm256_set1_epi16(0x80); - tff80 = _mm256_set1_epi16(0xff80); - tffe0 = _mm256_set1_epi16(0xffe0); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 8); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 8); - } else if (bd == 10) { - t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 2); - tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 2); - tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 2); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 6); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 6); - } else { // bd == 12 - t80 = _mm256_slli_epi16(_mm256_set1_epi16(0x80), 4); - tff80 = _mm256_slli_epi16(_mm256_set1_epi16(0xff80), 4); - tffe0 = _mm256_slli_epi16(_mm256_set1_epi16(0xffe0), 4); - t1f = _mm256_srli_epi16(_mm256_set1_epi16(0x1fff), 4); - t7f = _mm256_srli_epi16(_mm256_set1_epi16(0x7fff), 4); - } - - __m256i ps1 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 2 * p)), t80); - __m256i ps0 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s - 1 * p)), t80); - __m256i qs0 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 0 * p)), t80); - __m256i qs1 = - _mm256_subs_epi16(_mm256_loadu_si256((__m256i *)(s + 1 * p)), t80); - - // filter_mask and hev_mask - const __m256i zero = _mm256_setzero_si256(); - __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); - __m256i hev = _mm256_subs_epu16(flat, thresh); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); - - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - __m256i mask = - _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - const __m256i one = _mm256_set1_epi16(1); - mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); - mask = _mm256_max_epi16(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - __m256i work = _mm256_max_epi16( - _mm256_or_si256(_mm256_subs_epu16(p2, p1), _mm256_subs_epu16(p1, p2)), - _mm256_or_si256(_mm256_subs_epu16(p3, p2), _mm256_subs_epu16(p2, p3))); - mask = _mm256_max_epi16(work, mask); - work = _mm256_max_epi16( - _mm256_or_si256(_mm256_subs_epu16(q2, q1), _mm256_subs_epu16(q1, q2)), - _mm256_or_si256(_mm256_subs_epu16(q3, q2), _mm256_subs_epu16(q2, q3))); - mask = _mm256_max_epi16(work, mask); - mask = _mm256_subs_epu16(mask, limit); - mask = _mm256_cmpeq_epi16(mask, zero); - - // filter4 - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filt = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, hev); - __m256i work_a = _mm256_subs_epi16(qs0, ps0); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - pixel_clamp(&pmin, &pmax, &filt); - - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm256_and_si256(filt, mask); - - const __m256i t4 = _mm256_set1_epi16(4); - const __m256i t3 = _mm256_set1_epi16(3); - - __m256i filter1 = _mm256_adds_epi16(filt, t4); - pixel_clamp(&pmin, &pmax, &filter1); - __m256i filter2 = _mm256_adds_epi16(filt, t3); - pixel_clamp(&pmin, &pmax, &filter2); - - // Filter1 >> 3 - work_a = _mm256_cmpgt_epi16(zero, filter1); // get the values that are <0 - filter1 = _mm256_srli_epi16(filter1, 3); - work_a = _mm256_and_si256(work_a, tffe0); // sign bits for the values < 0 - filter1 = _mm256_and_si256(filter1, t1f); // clamp the range - filter1 = _mm256_or_si256(filter1, work_a); // reinsert the sign bits - - // Filter2 >> 3 - work_a = _mm256_cmpgt_epi16(zero, filter2); - filter2 = _mm256_srli_epi16(filter2, 3); - work_a = _mm256_and_si256(work_a, tffe0); - filter2 = _mm256_and_si256(filter2, t1f); - filter2 = _mm256_or_si256(filter2, work_a); - - // filt >> 1 - // equivalent to shifting 0x1f left by bitdepth - 8 - // and setting new bits to 1 - filt = _mm256_adds_epi16(filter1, one); - work_a = _mm256_cmpgt_epi16(zero, filt); - filt = _mm256_srli_epi16(filt, 1); - work_a = _mm256_and_si256(work_a, tff80); - filt = _mm256_and_si256(filt, t7f); - filt = _mm256_or_si256(filt, work_a); - - filt = _mm256_andnot_si256(hev, filt); - - filter1 = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &filter1); - q0 = _mm256_adds_epi16(filter1, t80); - - filter1 = _mm256_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &filter1); - q1 = _mm256_adds_epi16(filter1, t80); - - filter2 = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &filter2); - p0 = _mm256_adds_epi16(filter2, t80); - - filter2 = _mm256_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &filter2); - p1 = _mm256_adds_epi16(filter2, t80); - - _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); - _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); - _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); - _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); -} - -void aom_highbd_lpf_horizontal_8_dual_avx2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - - __m256i p3 = _mm256_loadu_si256((__m256i *)(s - 4 * p)); - __m256i q3 = _mm256_loadu_si256((__m256i *)(s + 3 * p)); - __m256i p2 = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - __m256i q2 = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - __m256i p1 = _mm256_loadu_si256((__m256i *)(s - 2 * p)); - __m256i q1 = _mm256_loadu_si256((__m256i *)(s + 1 * p)); - __m256i p0 = _mm256_loadu_si256((__m256i *)(s - 1 * p)); - __m256i q0 = _mm256_loadu_si256((__m256i *)(s + 0 * p)); - - __m256i blimit, limit, thresh; - get_dual_limit(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, - &blimit, &limit, &thresh); - - __m256i t80; - if (bd == 8) { - t80 = _mm256_set1_epi16(0x80); - } else if (bd == 10) { - t80 = _mm256_set1_epi16(0x200); - } else { // bd == 12 - t80 = _mm256_set1_epi16(0x800); - } - - __m256i ps1, ps0, qs0, qs1; - ps1 = _mm256_subs_epi16(p1, t80); - ps0 = _mm256_subs_epi16(p0, t80); - qs0 = _mm256_subs_epi16(q0, t80); - qs1 = _mm256_subs_epi16(q1, t80); - - // filter_mask and hev_mask - __m256i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - abs_p1p0 = _mm256_abs_epi16(_mm256_sub_epi16(p1, p0)); - abs_q1q0 = _mm256_abs_epi16(_mm256_sub_epi16(q1, q0)); - - abs_p0q0 = _mm256_abs_epi16(_mm256_sub_epi16(p0, q0)); - abs_p1q1 = _mm256_abs_epi16(_mm256_sub_epi16(p1, q1)); - __m256i flat = _mm256_max_epi16(abs_p1p0, abs_q1q0); - __m256i hev = _mm256_subs_epu16(flat, thresh); - const __m256i zero = _mm256_set1_epi16(0); - const __m256i ffff = _mm256_set1_epi16(0xFFFF); - hev = _mm256_xor_si256(_mm256_cmpeq_epi16(hev, zero), ffff); - - abs_p0q0 = _mm256_adds_epu16(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm256_srli_epi16(abs_p1q1, 1); - __m256i mask = - _mm256_subs_epu16(_mm256_adds_epu16(abs_p0q0, abs_p1q1), blimit); - mask = _mm256_xor_si256(_mm256_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - // So taking maximums continues to work: - - const __m256i one = _mm256_set1_epi16(1); - mask = _mm256_and_si256(mask, _mm256_adds_epu16(limit, one)); - mask = _mm256_max_epi16(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - mask = _mm256_max_epi16(abs_q1q0, mask); - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p1)), - _mm256_abs_epi16(_mm256_sub_epi16(q2, q1))); - mask = _mm256_max_epi16(work, mask); - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p2)), - _mm256_abs_epi16(_mm256_sub_epi16(q3, q2))); - mask = _mm256_max_epi16(work, mask); - mask = _mm256_subs_epu16(mask, limit); - mask = _mm256_cmpeq_epi16(mask, zero); - - // flat_mask4 - flat = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p2, p0)), - _mm256_abs_epi16(_mm256_sub_epi16(q2, q0))); - work = _mm256_max_epi16(_mm256_abs_epi16(_mm256_sub_epi16(p3, p0)), - _mm256_abs_epi16(_mm256_sub_epi16(q3, q0))); - flat = _mm256_max_epi16(work, flat); - flat = _mm256_max_epi16(abs_p1p0, flat); - flat = _mm256_max_epi16(abs_q1q0, flat); - - if (bd == 8) - flat = _mm256_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm256_subs_epu16(flat, _mm256_slli_epi16(one, 4)); - - flat = _mm256_cmpeq_epi16(flat, zero); - flat = _mm256_and_si256(flat, mask); // flat & mask - - // Added before shift for rounding part of ROUND_POWER_OF_TWO - __m256i workp_a, workp_b, workp_shft; - workp_a = - _mm256_add_epi16(_mm256_add_epi16(p3, p3), _mm256_add_epi16(p2, p1)); - const __m256i four = _mm256_set1_epi16(4); - workp_a = _mm256_add_epi16(_mm256_add_epi16(workp_a, four), p0); - workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, p2), p3); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op2[0], workp_shft); - - workp_b = _mm256_add_epi16(_mm256_add_epi16(q0, q1), p1); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op1[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q2); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p1), p0); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_op0[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p3), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, p0), q0); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq0[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p2), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q0), q1); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq1[0], workp_shft); - - workp_a = _mm256_add_epi16(_mm256_sub_epi16(workp_a, p1), q3); - workp_b = _mm256_add_epi16(_mm256_sub_epi16(workp_b, q1), q2); - workp_shft = _mm256_srli_epi16(_mm256_add_epi16(workp_a, workp_b), 3); - _mm256_storeu_si256((__m256i *)&flat_oq2[0], workp_shft); - - // lp filter - const __m256i pmax = _mm256_subs_epi16( - _mm256_subs_epi16(_mm256_slli_epi16(one, bd), one), t80); - const __m256i pmin = _mm256_subs_epi16(zero, t80); - - __m256i filt, filter1, filter2, work_a; - filt = _mm256_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, hev); - work_a = _mm256_subs_epi16(qs0, ps0); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - filt = _mm256_adds_epi16(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm256_and_si256(filt, mask); - - const __m256i t4 = _mm256_set1_epi16(4); - const __m256i t3 = _mm256_set1_epi16(3); - - filter1 = _mm256_adds_epi16(filt, t4); - filter2 = _mm256_adds_epi16(filt, t3); - - // Filter1 >> 3 - pixel_clamp(&pmin, &pmax, &filter1); - filter1 = _mm256_srai_epi16(filter1, 3); - - // Filter2 >> 3 - pixel_clamp(&pmin, &pmax, &filter2); - filter2 = _mm256_srai_epi16(filter2, 3); - - // filt >> 1 - filt = _mm256_adds_epi16(filter1, one); - filt = _mm256_srai_epi16(filt, 1); - // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; - filt = _mm256_andnot_si256(hev, filt); - - work_a = _mm256_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - q0 = _mm256_loadu_si256((__m256i *)flat_oq0); - work_a = _mm256_andnot_si256(flat, work_a); - q0 = _mm256_and_si256(flat, q0); - q0 = _mm256_or_si256(work_a, q0); - - work_a = _mm256_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - q1 = _mm256_loadu_si256((__m256i *)flat_oq1); - work_a = _mm256_andnot_si256(flat, work_a); - q1 = _mm256_and_si256(flat, q1); - q1 = _mm256_or_si256(work_a, q1); - - work_a = _mm256_loadu_si256((__m256i *)(s + 2 * p)); - q2 = _mm256_loadu_si256((__m256i *)flat_oq2); - work_a = _mm256_andnot_si256(flat, work_a); - q2 = _mm256_and_si256(flat, q2); - q2 = _mm256_or_si256(work_a, q2); - - work_a = _mm256_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - p0 = _mm256_loadu_si256((__m256i *)flat_op0); - work_a = _mm256_andnot_si256(flat, work_a); - p0 = _mm256_and_si256(flat, p0); - p0 = _mm256_or_si256(work_a, p0); - - work_a = _mm256_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm256_adds_epi16(work_a, t80); - p1 = _mm256_loadu_si256((__m256i *)flat_op1); - work_a = _mm256_andnot_si256(flat, work_a); - p1 = _mm256_and_si256(flat, p1); - p1 = _mm256_or_si256(work_a, p1); - - work_a = _mm256_loadu_si256((__m256i *)(s - 3 * p)); - p2 = _mm256_loadu_si256((__m256i *)flat_op2); - work_a = _mm256_andnot_si256(flat, work_a); - p2 = _mm256_and_si256(flat, p2); - p2 = _mm256_or_si256(work_a, p2); - - _mm256_storeu_si256((__m256i *)(s - 3 * p), p2); - _mm256_storeu_si256((__m256i *)(s - 2 * p), p1); - _mm256_storeu_si256((__m256i *)(s - 1 * p), p0); - _mm256_storeu_si256((__m256i *)(s + 0 * p), q0); - _mm256_storeu_si256((__m256i *)(s + 1 * p), q1); - _mm256_storeu_si256((__m256i *)(s + 2 * p), q2); -} - -void aom_highbd_lpf_vertical_4_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; - - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - aom_highbd_lpf_horizontal_4_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - highbd_transpose(src, 16, dst, p, 2); -} - -void aom_highbd_lpf_vertical_8_dual_avx2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, - const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; - - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); - - // Loop filtering - aom_highbd_lpf_horizontal_8_dual_avx2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; - - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - highbd_transpose(src, 16, dst, p, 2); -} -#endif // CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 0a399edf2..83e0098ba 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -11,29 +11,23 @@ #include // SSE2 -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/x86/lpf_common_sse2.h" -#include "aom_ports/emmintrin_compat.h" -#include "aom_ports/mem.h" +#include "config/aom_dsp_rtcd.h" -static INLINE void pixel_clamp(const __m128i *min, const __m128i *max, - __m128i *pixel) { - __m128i clamped, mask; +#include "aom_dsp/x86/lpf_common_sse2.h" - mask = _mm_cmpgt_epi16(*pixel, *max); - clamped = _mm_andnot_si128(mask, *pixel); - mask = _mm_and_si128(mask, *max); - clamped = _mm_or_si128(mask, clamped); +static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max, + __m128i *pixel) { + *pixel = _mm_min_epi16(*pixel, *max); + *pixel = _mm_max_epi16(*pixel, *min); +} - mask = _mm_cmpgt_epi16(clamped, *min); - clamped = _mm_and_si128(mask, clamped); - mask = _mm_andnot_si128(mask, *min); - *pixel = _mm_or_si128(clamped, mask); +static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) { + return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a)); } static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, const uint8_t *t, int bd, __m128i *blt, - __m128i *lt, __m128i *thr) { + __m128i *lt, __m128i *thr, __m128i *t80_out) { const int shift = bd - 8; const __m128i zero = _mm_setzero_si128(); @@ -45,6 +39,36 @@ static INLINE void get_limit(const uint8_t *bl, const uint8_t *l, x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero); *thr = _mm_slli_epi16(x, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); +} + +static INLINE void get_limit_dual( + const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, + const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, + int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out, + __m128i *t80_out) { + const int shift = bd - 8; + const __m128i zero = _mm_setzero_si128(); + + __m128i x0 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero); + __m128i x1 = + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *blt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *lt_out = _mm_slli_epi16(x0, shift); + + x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero); + x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero); + x0 = _mm_unpacklo_epi64(x0, x1); + *thr_out = _mm_slli_epi16(x0, shift); + + *t80_out = _mm_set1_epi16(1 << (bd - 1)); } static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, @@ -55,115 +79,217 @@ static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch, q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch)); } } -// _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); -static INLINE void highbd_hev_mask(const __m128i *p, const __m128i *q, - const __m128i *t, __m128i *hev) { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu16(p[1], p[0]), _mm_subs_epu16(p[0], p[1])); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu16(q[1], q[0]), _mm_subs_epu16(q[0], q[1])); - __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); - h = _mm_subs_epu16(h, *t); - const __m128i ffff = _mm_set1_epi16(0xFFFF); - const __m128i zero = _mm_setzero_si128(); - *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); -} - -static INLINE void highbd_filter_mask(const __m128i *p, const __m128i *q, - const __m128i *l, const __m128i *bl, - __m128i *mask) { - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu16(p[0], q[0]), _mm_subs_epu16(q[0], p[0])); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu16(p[1], q[1]), _mm_subs_epu16(q[1], p[1])); +static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, + const __m128i *l, const __m128i *bl, + __m128i *mask) { + __m128i abs_p0q0 = abs_diff16(p[0], q[0]); + __m128i abs_p1q1 = abs_diff16(p[1], q[1]); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_set1_epi16(0xFFFF); + __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); int i; for (i = 1; i < 4; ++i) { - max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[i - 1]), - _mm_subs_epu16(p[i - 1], p[i]))); - max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[i - 1]), - _mm_subs_epu16(q[i - 1], q[i]))); + max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1])); } max = _mm_subs_epu16(max, *l); *mask = _mm_cmpeq_epi16(max, zero); // return ~mask } -static INLINE void flat_mask_internal(const __m128i *th, const __m128i *p, - const __m128i *q, int bd, int start, - int end, __m128i *flat) { - __m128i max = _mm_setzero_si128(); +static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, + __m128i *p1p0, __m128i *q1q0, + __m128i *abs_p1p0, __m128i *l, + __m128i *bl, __m128i *t, + __m128i *hev, __m128i *mask) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i ffff = _mm_set1_epi16(0xFFFF); + __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; + __m128i max, max01, h; + + *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]); + *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]); + + abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0); + abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2 + + max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); + max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; + // So taking maximums continues to work: + max = _mm_and_si128(max, _mm_adds_epu16(*l, one)); + + *abs_p1p0 = abs_diff16(pq[0], pq[1]); + abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8); + max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0); + // mask |= (abs(*p1 - *p0) > limit) * -1; + // mask |= (abs(*q1 - *q0) > limit) * -1; + h = _mm_subs_epu16(max01, *t); + + *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + // replicate for the further "merged variables" usage + *hev = _mm_unpacklo_epi64(*hev, *hev); + + max = _mm_max_epi16(max, max01); + int i; + for (i = 2; i < x; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + max = _mm_subs_epu16(max, *l); + *mask = _mm_cmpeq_epi16(max, zero); // ~mask +} + +static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq, + int start, int end, __m128i *flat) { + int i; + __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]), + abs_diff16(pq[start + 1], pq[0])); + + for (i = start + 2; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0])); + } + max = _mm_max_epi16(max, _mm_srli_si128(max, 8)); + + __m128i ft; + ft = _mm_subs_epu16(max, *th); + + const __m128i zero = _mm_setzero_si128(); + *flat = _mm_cmpeq_epi16(ft, zero); +} + +static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p, + const __m128i *q, int start, int end, + __m128i *flat) { int i; - for (i = start; i < end; ++i) { - max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(p[i], p[0]), - _mm_subs_epu16(p[0], p[i]))); - max = _mm_max_epi16(max, _mm_or_si128(_mm_subs_epu16(q[i], q[0]), - _mm_subs_epu16(q[0], q[i]))); + __m128i max = + _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0])); + + for (i = start + 1; i < end; ++i) { + max = _mm_max_epi16(max, abs_diff16(p[i], p[0])); + max = _mm_max_epi16(max, abs_diff16(q[i], q[0])); } __m128i ft; - if (bd == 8) - ft = _mm_subs_epu16(max, *th); - else if (bd == 10) - ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 2)); - else // bd == 12 - ft = _mm_subs_epu16(max, _mm_slli_epi16(*th, 4)); + ft = _mm_subs_epu16(max, *th); const __m128i zero = _mm_setzero_si128(); *flat = _mm_cmpeq_epi16(ft, zero); } -// Note: -// Access p[3-1], p[0], and q[3-1], q[0] -static INLINE void highbd_flat_mask4(const __m128i *th, const __m128i *p, - const __m128i *q, __m128i *flat, int bd) { +static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat, + __m128i *flat2, int bd) { // check the distance 1,2,3 against 0 - flat_mask_internal(th, p, q, bd, 1, 4, flat); + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal(&th, pq, 1, 4, flat); + flat_mask_internal(&th, pq, 4, 7, flat2); } -// Note: -// access p[7-4], p[0], and q[7-4], q[0] -static INLINE void highbd_flat_mask5(const __m128i *th, const __m128i *p, - const __m128i *q, __m128i *flat, int bd) { - flat_mask_internal(th, p, q, bd, 4, 8, flat); +static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p, + const __m128i *q, __m128i *flat, + __m128i *flat2, int bd) { + // check the distance 1,2,3 against 0 + __m128i th = _mm_set1_epi16(1); + th = _mm_slli_epi16(th, bd - 8); + flat_mask_internal_dual(&th, p, q, 1, 4, flat); + flat_mask_internal_dual(&th, p, q, 4, 7, flat2); } -static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask, - const __m128i *th, int bd, __m128i *ps, - __m128i *qs) { - __m128i t80; - if (bd == 8) - t80 = _mm_set1_epi16(0x80); - else if (bd == 10) - t80 = _mm_set1_epi16(0x200); - else // bd == 12 - t80 = _mm_set1_epi16(0x800); - - __m128i ps0 = _mm_subs_epi16(p[0], t80); - __m128i ps1 = _mm_subs_epi16(p[1], t80); - __m128i qs0 = _mm_subs_epi16(q[0], t80); - __m128i qs1 = _mm_subs_epi16(q[1], t80); +static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0, __m128i *t80, + int bd) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i pmax = + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + const __m128i pmin = _mm_subs_epi16(zero, *t80); + + const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4); + __m128i ps1ps0_work, qs1qs0_work, work; + __m128i filt, filter2filter1, filter2filt, filter1filt; + + ps1ps0_work = _mm_subs_epi16(*p1p0, *t80); + qs1qs0_work = _mm_subs_epi16(*q1q0, *t80); + + work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work); + pixel_clamp(&pmin, &pmax, &work); + filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + filt = _mm_subs_epi16(filt, work); + // (aom_filter + 3 * (qs0 - ps0)) & mask + pixel_clamp(&pmin, &pmax, &filt); + filt = _mm_and_si128(filt, *mask); + filt = _mm_unpacklo_epi64(filt, filt); + + filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */ + pixel_clamp(&pmin, &pmax, &filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */ + filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1); + + // filt >> 1 + filt = _mm_adds_epi16(filt, one); + filt = _mm_srai_epi16(filt, 1); + filt = _mm_andnot_si128(*hev, filt); + + filter2filt = _mm_unpackhi_epi64(filter2filter1, filt); + filter1filt = _mm_unpacklo_epi64(filter2filter1, filt); + + qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt); + ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt); + + pixel_clamp(&pmin, &pmax, &qs1qs0_work); + pixel_clamp(&pmin, &pmax, &ps1ps0_work); + + *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80); + *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80); +} + +static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps, + __m128i *qs, const __m128i *mask, + const __m128i *th, int bd, + __m128i *t80) { + __m128i ps0 = _mm_subs_epi16(p[0], *t80); + __m128i ps1 = _mm_subs_epi16(p[1], *t80); + __m128i qs0 = _mm_subs_epi16(q[0], *t80); + __m128i qs1 = _mm_subs_epi16(q[1], *t80); const __m128i one = _mm_set1_epi16(1); const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); - const __m128i zero = _mm_setzero_si128(); - const __m128i pmin = _mm_subs_epi16(zero, t80); + _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80); + const __m128i zero = _mm_setzero_si128(); + const __m128i pmin = _mm_subs_epi16(zero, *t80); __m128i filter = _mm_subs_epi16(ps1, qs1); pixel_clamp(&pmin, &pmax, &filter); + // hev_filter __m128i hev; - highbd_hev_mask(p, q, th, &hev); + const __m128i abs_p1p0 = abs_diff16(p[1], p[0]); + const __m128i abs_q1q0 = abs_diff16(q[1], q[0]); + __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0); + h = _mm_subs_epu16(h, *th); + const __m128i ffff = _mm_cmpeq_epi16(h, h); + hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff); + filter = _mm_and_si128(filter, hev); const __m128i x = _mm_subs_epi16(qs0, ps0); @@ -172,145 +298,332 @@ static INLINE void highbd_filter4(__m128i *p, __m128i *q, const __m128i *mask, filter = _mm_adds_epi16(filter, x); pixel_clamp(&pmin, &pmax, &filter); filter = _mm_and_si128(filter, *mask); - const __m128i t3 = _mm_set1_epi16(3); const __m128i t4 = _mm_set1_epi16(4); - __m128i filter1 = _mm_adds_epi16(filter, t4); __m128i filter2 = _mm_adds_epi16(filter, t3); pixel_clamp(&pmin, &pmax, &filter1); pixel_clamp(&pmin, &pmax, &filter2); filter1 = _mm_srai_epi16(filter1, 3); filter2 = _mm_srai_epi16(filter2, 3); - qs0 = _mm_subs_epi16(qs0, filter1); pixel_clamp(&pmin, &pmax, &qs0); ps0 = _mm_adds_epi16(ps0, filter2); pixel_clamp(&pmin, &pmax, &ps0); - - qs[0] = _mm_adds_epi16(qs0, t80); - ps[0] = _mm_adds_epi16(ps0, t80); - + qs[0] = _mm_adds_epi16(qs0, *t80); + ps[0] = _mm_adds_epi16(ps0, *t80); filter = _mm_adds_epi16(filter1, one); filter = _mm_srai_epi16(filter, 1); filter = _mm_andnot_si128(hev, filter); - qs1 = _mm_subs_epi16(qs1, filter); pixel_clamp(&pmin, &pmax, &qs1); ps1 = _mm_adds_epi16(ps1, filter); pixel_clamp(&pmin, &pmax, &ps1); - - qs[1] = _mm_adds_epi16(qs1, t80); - ps[1] = _mm_adds_epi16(ps1, t80); + qs[1] = _mm_adds_epi16(qs1, *t80); + ps[1] = _mm_adds_epi16(ps1, *t80); } -typedef enum { FOUR_PIXELS, EIGHT_PIXELS } PixelOutput; - -static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd, - PixelOutput pixel_output) { +static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( + __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, + const unsigned char *lt, const unsigned char *thr, int bd) { + int i; __m128i blimit, limit, thresh; - get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh); + __m128i t80; + get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); + + for (i = 0; i < 7; i++) { + pq[i] = _mm_unpacklo_epi64(p[i], q[i]); + } + __m128i mask, hevhev; + __m128i p1p0, q1q0, abs_p1p0; - __m128i p[8], q[8]; - load_highbd_pixel(s, 8, pitch, p, q); + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hevhev, &mask); - __m128i mask; - highbd_filter_mask(p, q, &limit, &blimit, &mask); + __m128i ps0ps1, qs0qs1; + // filter4 + highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd); __m128i flat, flat2; - const __m128i one = _mm_set1_epi16(1); - highbd_flat_mask4(&one, p, q, &flat, bd); - highbd_flat_mask5(&one, p, q, &flat2, bd); + highbd_flat_mask4_sse2(pq, &flat, &flat2, bd); flat = _mm_and_si128(flat, mask); flat2 = _mm_and_si128(flat2, flat); - __m128i ps[2], qs[2]; - highbd_filter4(p, q, &mask, &thresh, bd, ps, qs); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + flat2 = _mm_unpacklo_epi64(flat2, flat2); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[7], flat2_q[7]; + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; { + __m128i work0; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); + __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); + __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0])); + flat2_q[0] = + _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0])); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); + + __m128i sum_p6, sum_p3; + sum_p6 = _mm_add_epi16(pq[6], pq[6]); + sum_p3 = _mm_add_epi16(pq[3], pq[3]); + + sum_q = _mm_sub_epi16(sum_p, p[5]); + sum_p = _mm_sub_epi16(sum_p, q[5]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); + flat2_p[1] = _mm_add_epi16(sum_p, work0); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + + sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lp = _mm_sub_epi16(sum_lp, q[2]); + + work0 = _mm_add_epi16(sum_p3, pq[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + + flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); - __m128i sum_p = - _mm_add_epi16(_mm_add_epi16(p[6], p[5]), _mm_add_epi16(p[4], p[3])); - __m128i sum_q = - _mm_add_epi16(_mm_add_epi16(q[6], q[5]), _mm_add_epi16(q[4], q[3])); + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + sum_lp = _mm_sub_epi16(sum_lp, q[1]); + sum_lq = _mm_sub_epi16(sum_lq, p[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq[3]); + work0 = _mm_add_epi16(sum_p3, pq[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } + + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // highbd_filter16 + for (i = 5; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + // get values for when (flat2 && flat && mask) + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } +} + +void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, + const uint8_t *blt, const uint8_t *lt, + const uint8_t *thr, int bd) { + __m128i p[7], q[7], pq[7]; + int i; + + for (i = 0; i < 7; i++) { + p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch)); + q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); + } + + highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd); + + for (i = 0; i < 6; i++) { + _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); + _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8)); + } +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( + __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0, + const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, + const uint8_t *thr1, int bd) { + __m128i blimit, limit, thresh, t80; + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, + &t80); + __m128i mask; + highbd_filter_mask_dual(p, q, &limit, &blimit, &mask); + __m128i flat, flat2; + highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd); + + flat = _mm_and_si128(flat, mask); + flat2 = _mm_and_si128(flat2, flat); + __m128i ps[2], qs[2]; + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); + // flat and wide flat calculations + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); - __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); sum_q = _mm_add_epi16(sum_q, sum_lq); sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - - flat2_p[0] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(p[7], p[0])), 4); - flat2_q[0] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(q[7], q[0])), 4); + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); flat_p[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); flat_q[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3); - - __m128i sum_p7 = _mm_add_epi16(p[7], p[7]); - __m128i sum_q7 = _mm_add_epi16(q[7], q[7]); + __m128i sum_p6 = _mm_add_epi16(p[6], p[6]); + __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - - sum_q = _mm_sub_epi16(sum_p, p[6]); - sum_p = _mm_sub_epi16(sum_p, q[6]); - flat2_p[1] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[1])), 4); - flat2_q[1] = - _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[1])), 4); - + sum_q = _mm_sub_epi16(sum_p, p[5]); + sum_p = _mm_sub_epi16(sum_p, q[5]); + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); sum_lq = _mm_sub_epi16(sum_lp, p[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); flat_p[1] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); flat_q[1] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - - sum_p7 = _mm_add_epi16(sum_p7, p[7]); - sum_q7 = _mm_add_epi16(sum_q7, q[7]); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); sum_p3 = _mm_add_epi16(sum_p3, p[3]); sum_q3 = _mm_add_epi16(sum_q3, q[3]); - - sum_p = _mm_sub_epi16(sum_p, q[5]); - sum_q = _mm_sub_epi16(sum_q, p[5]); - flat2_p[2] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[2])), 4); - flat2_q[2] = - _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[2])), 4); - + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, p[1]); flat_p[2] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); flat_q[2] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - - int i; - for (i = 3; i < 7; ++i) { - sum_p7 = _mm_add_epi16(sum_p7, p[7]); - sum_q7 = _mm_add_epi16(sum_q7, q[7]); - sum_p = _mm_sub_epi16(sum_p, q[7 - i]); - sum_q = _mm_sub_epi16(sum_q, p[7 - i]); - flat2_p[i] = - _mm_srli_epi16(_mm_add_epi16(sum_p, _mm_add_epi16(sum_p7, p[i])), 4); - flat2_q[i] = - _mm_srli_epi16(_mm_add_epi16(sum_q, _mm_add_epi16(sum_q7, q[i])), 4); - } + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, _mm_add_epi16( + sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, _mm_add_epi16( + sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); } - // highbd_filter8 p[2] = _mm_andnot_si128(flat, p[2]); // p2 remains unchanged if !(flat && mask) @@ -320,7 +633,6 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, q[2] = _mm_andnot_si128(flat, q[2]); flat_q[2] = _mm_and_si128(flat, flat_q[2]); q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - int i; for (i = 1; i >= 0; i--) { ps[i] = _mm_andnot_si128(flat, ps[i]); @@ -330,675 +642,979 @@ static INLINE void highbd_lpf_horz_edge_8_internal(uint16_t *s, int pitch, flat_q[i] = _mm_and_si128(flat, flat_q[i]); q[i] = _mm_or_si128(qs[i], flat_q[i]); } - // highbd_filter16 - - if (pixel_output == FOUR_PIXELS) { - for (i = 6; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); - _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_storel_epi64((__m128i *)(s + i * pitch), q[i]); - } - } else { // EIGHT_PIXELS - for (i = 6; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); - _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_store_si128((__m128i *)(s + i * pitch), q[i]); - } + for (i = 5; i >= 0; i--) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); } } -// Note: -// highbd_lpf_horz_edge_8_8p() output 8 pixels per register -// highbd_lpf_horz_edge_8_4p() output 4 pixels per register -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 -static INLINE void highbd_lpf_horz_edge_8_4p(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, FOUR_PIXELS); -} -#endif // #if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - -static INLINE void highbd_lpf_horz_edge_8_8p(uint16_t *s, int pitch, - const uint8_t *blt, - const uint8_t *lt, - const uint8_t *thr, int bd) { - highbd_lpf_horz_edge_8_internal(s, pitch, blt, lt, thr, bd, EIGHT_PIXELS); -} - -void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); -#else - highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); -#endif -} - -void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_lpf_horz_edge_8_4p(s, p, _blimit, _limit, _thresh, bd); -#else - highbd_lpf_horz_edge_8_8p(s, p, _blimit, _limit, _thresh, bd); - highbd_lpf_horz_edge_8_8p(s + 8, p, _blimit, _limit, _thresh, bd); -#endif -} - -static INLINE void store_horizontal_8(const __m128i *p2, const __m128i *p1, - const __m128i *p0, const __m128i *q0, - const __m128i *q1, const __m128i *q2, - int p, uint16_t *s) { -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - _mm_storel_epi64((__m128i *)(s - 3 * p), *p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), *p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), *p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), *q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), *q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), *q2); -#else - _mm_store_si128((__m128i *)(s - 3 * p), *p2); - _mm_store_si128((__m128i *)(s - 2 * p), *p1); - _mm_store_si128((__m128i *)(s - 1 * p), *p0); - _mm_store_si128((__m128i *)(s + 0 * p), *q0); - _mm_store_si128((__m128i *)(s + 1 * p), *q1); - _mm_store_si128((__m128i *)(s + 2 * p), *q2); -#endif +void aom_highbd_lpf_horizontal_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p[7], q[7]; + int i; + load_highbd_pixel(s, 7, pitch, p, q); + + highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + for (i = 0; i < 6; i++) { + _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + } } -void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); - DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); +static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit, + const uint8_t *_limit, const uint8_t *_thresh, int bd) { __m128i blimit, limit, thresh; __m128i mask, hev, flat; - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); - const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_cmpeq_epi16(one, one); - __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; - const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft; + __m128i pq[3]; + __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0; + __m128i flat_p1p0, flat_q0q1; - const __m128i t4 = _mm_set1_epi16(4); - const __m128i t3 = _mm_set1_epi16(3); + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); __m128i t80; - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i ps1, ps0, qs0, qs1; - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); - t80 = _mm_set1_epi16(0x80); - } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); - t80 = _mm_set1_epi16(0x200); - } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); - t80 = _mm_set1_epi16(0x800); + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + // flat_mask + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); + + { + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), + _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 + workp_a = + _mm_add_epi16(workp_a, + workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 + + flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), + *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 + workp_b = _mm_add_epi16(*q1, *q2); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), + *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_shft1 = _mm_add_epi16( + workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + + flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); } + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); - ps1 = _mm_subs_epi16(p1, t80); - ps0 = _mm_subs_epi16(p0, t80); - qs0 = _mm_subs_epi16(q0, t80); - qs1 = _mm_subs_epi16(q1, t80); + qs1qs0 = _mm_andnot_si128(flat, qs1qs0); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - // filter_mask and hev_mask - abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); - abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + ps1ps0 = _mm_andnot_si128(flat, ps1ps0); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); +} - abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); - abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); - flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); +static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( + __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, + __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0, + const unsigned char *_thresh0, const unsigned char *_blimit1, + const unsigned char *_limit1, const unsigned char *_thresh1, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat, work; + __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1; + __m128i op1, op0, oq0, oq1; + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p2p1 = abs_diff16(*p2, *p1); + abs_p1p0 = abs_diff16(*p1, *p0); + abs_q1q0 = abs_diff16(*q1, *q0); + abs_q2q1 = abs_diff16(*q2, *q1); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); - mask = _mm_max_epi16(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - mask = _mm_max_epi16(abs_q1q0, mask); - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + mask = _mm_max_epi16(abs_q2q1, mask); + work = _mm_max_epi16(abs_p1p0, abs_q1q0); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_max_epi16(mask, abs_p2p1); + mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // flat_mask + flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); + flat = _mm_max_epi16(flat, work); + + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + { + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), + _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + op1 = _mm_srli_epi16(workp_shft0, 3); + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 + workp_a = + _mm_add_epi16(workp_a, + workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 + op0 = _mm_srli_epi16(workp_a, 3); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), + *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 + workp_b = _mm_add_epi16(*q1, *q2); + workp_shft0 = _mm_add_epi16( + workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + oq0 = _mm_srli_epi16(workp_shft0, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), + *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_b = _mm_add_epi16(*q2, *q2); + workp_shft1 = _mm_add_epi16( + workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + oq1 = _mm_srli_epi16(workp_shft1, 3); + } + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); +} + +void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out, + _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8)); +} + +void aom_highbd_lpf_horizontal_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i p2, p1, p0, q0, q1, q2; + + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + const unsigned char *_blimit, const unsigned char *_limit, + const unsigned char *_thresh, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i blimit, limit, thresh; + __m128i mask, hev, flat; + __m128i pq[4]; + __m128i p1p0, q1q0, ps1ps0, qs1qs0; + __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1; + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + pq[2] = _mm_unpacklo_epi64(*p2, *q2); + pq[3] = _mm_unpacklo_epi64(*p3, *q3); + + __m128i abs_p1p0; + + const __m128i four = _mm_set1_epi16(4); + __m128i t80; + const __m128i one = _mm_set1_epi16(0x1); + + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + // flat_mask4 - flat = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)), - _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2))); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)), - _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3))); - flat = _mm_max_epi16(work, flat); + flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); flat = _mm_max_epi16(abs_p1p0, flat); - flat = _mm_max_epi16(abs_q1q0, flat); + flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); - if (bd == 8) - flat = _mm_subs_epu16(flat, one); - else if (bd == 10) - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2)); - else // bd == 12 - flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4)); + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); flat = _mm_cmpeq_epi16(flat, zero); - flat = _mm_and_si128(flat, mask); // flat & mask + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); - // Added before shift for rounding part of ROUND_POWER_OF_TWO + { + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + workp_shft0 = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + } + + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_store_si128((__m128i *)&flat_op2[0], workp_shft); + qs1qs0 = _mm_andnot_si128(flat, qs1qs0); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_store_si128((__m128i *)&flat_op1[0], workp_shft); + ps1ps0 = _mm_andnot_si128(flat, ps1ps0); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_store_si128((__m128i *)&flat_op0[0], workp_shft); + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft); + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0, + const unsigned char *_limit0, const unsigned char *_thresh0, + const unsigned char *_blimit1, const unsigned char *_limit1, + const unsigned char *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i t80; + __m128i mask, flat; + __m128i work_a, op2, oq2, op1, op0, oq0, oq1; + __m128i abs_p1q1, abs_p0q0, work0, work1, work2; + + const __m128i zero = _mm_setzero_si128(); + const __m128i four = _mm_set1_epi16(4); + const __m128i one = _mm_set1_epi16(0x1); + const __m128i ffff = _mm_cmpeq_epi16(one, one); + + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); + + abs_p0q0 = abs_diff16(*p0, *q0); + abs_p1q1 = abs_diff16(*p1, *q1); + + abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); + mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); + // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1; + + // So taking maximums continues to work: + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); + + work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1)); + work1 = + _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0)); // tbu 4 flat + work0 = _mm_max_epi16(work0, work1); + work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3)); + work2 = _mm_max_epi16(work2, work0); + mask = _mm_max_epi16(work2, mask); + + mask = _mm_subs_epu16(mask, limit0); + mask = _mm_cmpeq_epi16(mask, zero); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft); + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); + flat = _mm_max_epi16(work1, flat); + work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); + flat = _mm_max_epi16(work0, flat); - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft); + flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); + + flat = _mm_cmpeq_epi16(flat, zero); + flat = _mm_and_si128(flat, mask); // flat & mask + + { + __m128i workp_a, workp_b; + // Added before shift for rounding part of ROUND_POWER_OF_TWO + + // o*p2 + workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p1 + workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); + op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // o*p0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0); + op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0); + oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1); + oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); + oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + } // lp filter - const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); - const __m128i pmin = _mm_subs_epi16(zero, t80); + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } - filt = _mm_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); - filt = _mm_and_si128(filt, hev); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm_and_si128(filt, mask); + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); - filter1 = _mm_adds_epi16(filt, t4); - filter2 = _mm_adds_epi16(filt, t3); + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); - // Filter1 >> 3 - pixel_clamp(&pmin, &pmax, &filter1); - filter1 = _mm_srai_epi16(filter1, 3); + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); - // Filter2 >> 3 - pixel_clamp(&pmin, &pmax, &filter2); - filter2 = _mm_srai_epi16(filter2, 3); + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &work_a); - work_a = _mm_adds_epi16(work_a, t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - store_horizontal_8(&p2, &p1, &p0, &q0, &q1, &q2, p, s); + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); +} + +void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, + &p1p0, _blimit, _limit, _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } void aom_highbd_lpf_horizontal_8_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s + 0 * p)); + + highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, + _blimit0, _limit0, _thresh0, _blimit1, + _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); } -void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - const __m128i zero = _mm_set1_epi16(0); +static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out, + __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit, + const uint8_t *_thresh, int bd) { __m128i blimit, limit, thresh; - __m128i mask, hev, flat; -#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); -#endif - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); -#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); -#endif - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)); + __m128i mask, hev; + __m128i p1p0, q1q0; + __m128i pq[2]; + + __m128i abs_p1p0; + + __m128i t80; + get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80); + + pq[0] = _mm_unpacklo_epi64(*p0, *q0); + pq[1] = _mm_unpacklo_epi64(*p1, *q1); + + highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, + &thresh, &hev, &mask); + + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); +} + +static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps, + __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i blimit0, limit0, thresh0; + __m128i mask, flat; + __m128i p[2], q[2]; + + const __m128i zero = _mm_setzero_si128(); + __m128i abs_p0q0 = abs_diff16(*q0, *p0); + __m128i abs_p1q1 = abs_diff16(*q1, *p1); + + __m128i abs_p1p0 = abs_diff16(*p1, *p0); + __m128i abs_q1q0 = abs_diff16(*q1, *q0); + const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0); const __m128i one = _mm_set1_epi16(1); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); - const __m128i t4 = _mm_set1_epi16(4); - const __m128i t3 = _mm_set1_epi16(3); __m128i t80; - __m128i tff80; - __m128i tffe0; - __m128i t1f; - // equivalent to shifting 0x1f left by bitdepth - 8 - // and setting new bits to 1 - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i t7f; - // equivalent to shifting 0x7f left by bitdepth - 8 - // and setting new bits to 1 - __m128i ps1, ps0, qs0, qs1; - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); - t80 = _mm_set1_epi16(0x80); - tff80 = _mm_set1_epi16(0xff80); - tffe0 = _mm_set1_epi16(0xffe0); - t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); - t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); - } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); - t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); - t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); - t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); - } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); - t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); - t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); - t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); - } - ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd, + &blimit0, &limit0, &thresh0, &t80); // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one)); mask = _mm_max_epi16(flat, mask); -#if !(CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4) - __m128i work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)), - _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3))); - mask = _mm_max_epi16(work, mask); - work = _mm_max_epi16( - _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), - _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); - mask = _mm_max_epi16(work, mask); -#endif - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); - // filter4 - const __m128i pmax = - _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80); - const __m128i pmin = _mm_subs_epi16(zero, t80); - - filt = _mm_subs_epi16(ps1, qs1); - pixel_clamp(&pmin, &pmax, &filt); - filt = _mm_and_si128(filt, hev); - work_a = _mm_subs_epi16(qs0, ps0); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - filt = _mm_adds_epi16(filt, work_a); - pixel_clamp(&pmin, &pmax, &filt); - - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi16(filt, t4); - pixel_clamp(&pmin, &pmax, &filter1); - - filter2 = _mm_adds_epi16(filt, t3); - pixel_clamp(&pmin, &pmax, &filter2); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0 - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0 - filter1 = _mm_and_si128(filter1, t1f); // clamp the range - filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; - // Filter2 >> 3 - work_a = _mm_cmpgt_epi16(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, tffe0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); +} - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - work_a = _mm_cmpgt_epi16(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, tff80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - q0 = _mm_subs_epi16(qs0, filter1); - pixel_clamp(&pmin, &pmax, &q0); - q0 = _mm_adds_epi16(q0, t80); - - q1 = _mm_subs_epi16(qs1, filt); - pixel_clamp(&pmin, &pmax, &q1); - q1 = _mm_adds_epi16(q1, t80); - - p0 = _mm_adds_epi16(ps0, filter2); - pixel_clamp(&pmin, &pmax, &p0); - p0 = _mm_adds_epi16(p0, t80); - - p1 = _mm_adds_epi16(ps1, filt); - pixel_clamp(&pmin, &pmax, &p1); - p1 = _mm_adds_epi16(p1, t80); -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); -#else - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); -#endif +void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, + const uint8_t *_blimit, + const uint8_t *_limit, + const uint8_t *_thresh, int bd) { + __m128i p1p0, q1q0; + __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit, + _thresh, bd); + + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); } void aom_highbd_lpf_horizontal_4_dual_sse2( uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1, int bd) { - aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + __m128i ps[2], qs[2]; + + highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0, + _thresh0, _blimit1, _limit1, _thresh1, bd); + + _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]); + _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]); + _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]); + _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]); } void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); - uint16_t *src[1]; - uint16_t *dst[1]; + __m128i x0, x1, x2, x3, d0, d1, d2, d3; + __m128i p1p0, q1q0; + __m128i p1, q1; - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3); - // Loop filtering - aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit, + thresh, bd); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); - src[0] = t_dst; - dst[0] = s - 4; + // transpose from 8x4 to 4x8 + highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); - // Transpose back - highbd_transpose(src, 8, dst, p, 1); + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); } void aom_highbd_lpf_vertical_4_dual_sse2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i ps[2], qs[2]; - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p)); + + highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3); + + highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0, + thresh0, blimit1, limit1, thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2, + &d3, &d4, &d5, &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); +} - // Loop filtering - aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; +void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x3, x2, x1, x0, p0, q0; + __m128i p1p0, q1q0; + + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + + highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit, + limit, thresh, bd); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); - // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); +} + +void aom_highbd_lpf_vertical_6_dual_sse2( + uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1, int bd) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0, p1, q1, p2, q2; + + x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1, + &p0, &q0, &q1, &q2, &d6, &d7); + + highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0, + _limit0, _thresh0, _blimit1, _limit1, + _thresh1, bd); + + highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3); + _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4); + _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5); + _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6); + _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7); } void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); - uint16_t *src[1]; - uint16_t *dst[1]; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i p2, p1, p0, p3, q0; + __m128i q1q0, p1p0; - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; + p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p)); + p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p)); + p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p)); + p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p)); - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); // Loop filtering - aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); + highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, + &p1p0, blimit, limit, thresh, bd); - src[0] = t_dst; - dst[0] = s - 4; + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); - // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, + &d1, &d2, &d3); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3); } void aom_highbd_lpf_vertical_8_dual_sse2( uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); - uint16_t *src[2]; - uint16_t *dst[2]; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p)); + + highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1, + &d2, &d3, &d4, &d5, &d6, &d7); + + highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, + blimit0, limit0, thresh0, blimit1, limit1, + thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1, + &x2, &x3, &x4, &x5, &x6, &x7); + + _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0); + _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1); + _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2); + _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3); + _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4); + _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5); + _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6); + _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7); +} - // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); +void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + __m128i q[7], p[7], pq[7]; + __m128i p6, p5, p4, p3; + __m128i p6_2, p5_2, p4_2, p3_2; + __m128i d0, d1, d2, d3; + __m128i d0_2, d1_2, d2_2, d3_2, d7_2; - // Loop filtering - aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, - thresh0, blimit1, limit1, thresh1, bd); - src[0] = t_dst; - src[1] = t_dst + 8; + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4], + &p[3], &p[2], &p[1], &p[0]); - // Transpose back - highbd_transpose(src, 16, dst, p, 2); -} + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); -void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); - uint16_t *src[2]; - uint16_t *dst[2]; + highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2], + &q[3], &q[4], &q[5], &q[6], &d7_2); - src[0] = s - 8; - src[1] = s; - dst[0] = t_dst; - dst[1] = t_dst + 8 * 8; + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); - // Transpose 16x8 - highbd_transpose(src, p, dst, 8, 2); + highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2], + &pq[1], &pq[0], &d0, &d1, &d2, &d3); - // Loop filtering - aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, - bd); - src[0] = t_dst; - src[1] = t_dst + 8 * 8; - dst[0] = s - 8; - dst[1] = s; - - // Transpose back - highbd_transpose(src, 8, dst, p, 2); -} - -void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh, int bd) { - DECLARE_ALIGNED(16, uint16_t, t_dst[256]); - - // Transpose 16x16 - highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); - -#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 - highbd_lpf_horz_edge_8_8p(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); -#else - aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, - thresh, bd); -#endif - // Transpose back - highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + q[0] = _mm_srli_si128(pq[0], 8); + q[1] = _mm_srli_si128(pq[1], 8); + q[2] = _mm_srli_si128(pq[2], 8); + q[3] = _mm_srli_si128(pq[3], 8); + q[4] = _mm_srli_si128(pq[4], 8); + q[5] = _mm_srli_si128(pq[5], 8); + + highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], + &d7_2, &d0_2, &d1_2, &d2_2, &d3_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2); + + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2); +} + +void aom_highbd_lpf_vertical_14_dual_sse2( + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + __m128i q[7], p[7]; + __m128i p6, p5, p4, p3, p2, p1, p0, q0; + __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2; + __m128i d0, d7; + __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out; + + p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch)); + p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch)); + p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch)); + p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch)); + p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch)); + p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch)); + p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch)); + q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6], + &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]); + + p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch)); + p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); + + highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2, + &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5], + &q[6], &d7); + + highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1, + limit1, thresh1, bd); + + highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0], + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out); + + highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7, + &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out, + &d6_out, &d7_out); + + _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out); + _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out); } diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c index 2bbf15ef2..dea113a29 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,7 +11,8 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm index 855bc6558..e0d22522d 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm @@ -288,11 +288,9 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SADNXN4D 4, 16 HIGH_SADNXN4D 16, 4 HIGH_SADNXN4D 8, 32 HIGH_SADNXN4D 32, 8 HIGH_SADNXN4D 16, 64 HIGH_SADNXN4D 64, 16 -%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm index 760e68aab..3398d8a2a 100644 --- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm @@ -158,10 +158,8 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2 HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2 -%endif ; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -230,10 +228,8 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2 HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2 -%endif ; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -302,12 +298,10 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2 HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2 HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2 HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2 -%endif ; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -376,7 +370,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 -%endif diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm index ee19796e3..61f5b8e86 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -94,7 +94,7 @@ SECTION .text %define filter_idx_shift 5 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ @@ -102,19 +102,20 @@ SECTION .text sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -133,8 +134,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, \ - sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height heightd ; Store bilin_filter and pw_8 location in stack @@ -153,22 +155,16 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, sse + x_offset, y_offset, \ + dst, dst_stride, height, sse %define block_height heightd %endif @@ -287,14 +283,14 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -311,7 +307,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -514,14 +510,14 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -538,7 +534,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -636,14 +632,14 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -660,7 +656,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -735,14 +731,14 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -759,7 +755,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -862,8 +858,8 @@ SECTION .text .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift @@ -872,7 +868,7 @@ SECTION .text mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -900,7 +896,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif ; end of load filter diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c index befd81269..18eb03d12 100644 --- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c @@ -13,8 +13,8 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride, const uint16_t *src, ptrdiff_t src_stride, @@ -204,21 +204,15 @@ SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); } SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); } SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); } SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); } -#if CONFIG_EXT_PARTITION SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); } SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); } SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); } -#endif SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); } SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); } SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); } SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); } SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); } SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); } -#if CONFIG_EXT_PARTITION -SUBTRACT_FUN(32x128) { STACK_V(64, subtract_32x64); } -SUBTRACT_FUN(128x32) { STACK_H(64, subtract_64x32); } -#endif static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { if (rows == 4) { @@ -244,25 +238,17 @@ static SubtractWxHFuncType getSubtractFunc(int rows, int cols) { if (cols == 16) return subtract_16x32; if (cols == 32) return subtract_32x32; if (cols == 64) return subtract_64x32; -#if CONFIG_EXT_PARTITION - if (cols == 128) return subtract_128x32; -#endif // CONFIG_EXT_PARTITION } if (rows == 64) { if (cols == 16) return subtract_16x64; if (cols == 32) return subtract_32x64; if (cols == 64) return subtract_64x64; -#if CONFIG_EXT_PARTITION if (cols == 128) return subtract_128x64; -#endif // CONFIG_EXT_PARTITION } -#if CONFIG_EXT_PARTITION if (rows == 128) { - if (cols == 32) return subtract_32x128; if (cols == 64) return subtract_64x128; if (cols == 128) return subtract_128x128; } -#endif // CONFIG_EXT_PARTITION assert(0); return NULL; } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm index cf8ea498c..0d954e178 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm +++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm @@ -14,6 +14,8 @@ %include "aom_ports/x86_abi_support.asm" +SECTION .text + ;unsigned int aom_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 62acf3ed3..fdfadc886 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -12,13 +12,17 @@ #include #include // SSE2 -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" -#include "./av1_rtcd.h" #include "av1/common/filter.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, @@ -185,13 +189,11 @@ VAR_FN(16, 16, 16, 8); VAR_FN(16, 8, 8, 7); VAR_FN(8, 16, 8, 7); VAR_FN(8, 8, 8, 6); -#if CONFIG_EXT_PARTITION_TYPES VAR_FN(16, 4, 16, 6); VAR_FN(8, 32, 8, 8); -VAR_FN(32, 8, 16, 8); +VAR_FN(32, 8, 8, 8); VAR_FN(16, 64, 16, 10); VAR_FN(64, 16, 16, 10); -#endif #undef VAR_FN @@ -398,7 +400,6 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION_TYPES #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ FN(64, 32, 16, 6, 5, opt, (int64_t)); \ @@ -416,20 +417,6 @@ DECLS(sse2); FN(32, 8, 16, 5, 3, opt, (int64_t)); \ FN(16, 64, 16, 4, 6, opt, (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t)) -#else -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)) -#endif FNS(sse2); @@ -577,7 +564,6 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION_TYPES #define FNS(opt) \ FN(64, 64, 16, 6, 6, opt, (int64_t)); \ FN(64, 32, 16, 6, 5, opt, (int64_t)); \ @@ -595,30 +581,104 @@ DECLS(sse2); FN(32, 8, 16, 5, 3, opt, (int64_t)); \ FN(16, 64, 16, 4, 6, opt, (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t)); -#else -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); -#endif FNS(sse2); #undef FNS #undef FN -void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, +void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, + const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint16_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + // Note: This is mostly a copy from the >=8X8 case in + // build_inter_predictors() function, with some small tweaks. + uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); + + // Some assumptions. + const int plane = 0; + + // Get pre-requisites. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = pd->subsampling_x; + const int ssy = pd->subsampling_y; + assert(ssx == 0 && ssy == 0); + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + + // Calculate subpel_x/y and x/y_step. + const int row_start = 0; // Because ss_y is 0. + const int col_start = 0; // Because ss_x is 0. + const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; + int orig_pos_y = pre_y << SUBPEL_BITS; + orig_pos_y += mv->row * (1 << (1 - ssy)); + int orig_pos_x = pre_x << SUBPEL_BITS; + orig_pos_x += mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + const uint8_t *const pre = + pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + pos_x & SCALE_SUBPEL_MASK, + pos_y & SCALE_SUBPEL_MASK }; + + // Get warp types. + const WarpedMotionParams *const wm = + &xd->global_motion[mi->ref_frame[ref_num]]; + const int is_global = is_global_mv_block(mi, wm->wmtype); + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + // Get convolve parameters. + ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + const InterpFilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // Get the inter predictor. + const int build_for_obmc = 0; + av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, + &subpel_params, sf, width, height, &conv_params, + filters, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, plane, ref_num, mi, + build_for_obmc, xd, cm->allow_warped_motion); + return; + } + } + + const InterpFilterParams filter = + av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); if (width >= 8) { @@ -648,54 +708,48 @@ void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height, ref += 2 * ref_stride; } } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), + width, kernel, 16, NULL, -1, width, height, bd); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), + width, NULL, -1, kernel, 16, width, height, bd); } else { - InterpFilterParams filter; - filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); - if (!subpel_y_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, - CONVERT_TO_BYTEPTR(comp_pred), width, kernel, - 16, NULL, -1, width, height, bd); - } else if (!subpel_x_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, NULL, -1, kernel, 16, width, height, bd); - } else { - DECLARE_ALIGNED(16, uint16_t, - temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); - const int16_t *kernel_x; - const int16_t *kernel_y; - int intermediate_height; - kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), - ref_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height, bd); - aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); - } + DECLARE_ALIGNED(16, uint16_t, + temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + ref_stride, CONVERT_TO_BYTEPTR(temp), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height, bd); + aom_highbd_convolve8_vert( + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, + 16, width, height, bd); } } -void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, - const uint8_t *pred8, int width, - int height, int subpel_x_q3, - int subpel_y_q3, - const uint8_t *ref8, - int ref_stride, int bd) { +void aom_highbd_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; int i; - aom_highbd_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, - ref8, ref_stride, bd); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd); /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ assert(!(width * height & 7)); n = width * height >> 3; @@ -707,3 +761,102 @@ void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred, pred += 8; } } + +static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { + assert(DIST_PRECISION_BITS <= 4); + __m128i mult0 = _mm_mullo_epi16(*p0, *w0); + __m128i mult1 = _mm_mullo_epi16(*p1, *w1); + __m128i sum = _mm_adds_epu16(mult0, mult1); + __m128i round = _mm_adds_epu16(sum, *r); + __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS); + + xx_storeu_128(result, shift); +} + +void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, + const uint8_t *pred8, int width, + int height, const uint8_t *ref8, + int ref_stride, + const JNT_COMP_PARAMS *jcp_param) { + int i; + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + if (width >= 8) { + // Read 8 pixels one row at a time + assert(!(width & 7)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 8) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 8; + } + ref += ref_stride - width; + } + } else { + // Read 4 pixels two rows at a time + assert(!(width & 3)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + ref += 2 * ref_stride; + } + } +} + +void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + int n; + int i; + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, + bd); + assert(!(width * height & 7)); + n = width * height >> 3; + + const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; + const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; + const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0); + const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + + comp_pred += 8; + pred += 8; + } +} diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c index cc7f52811..6c247a91b 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c @@ -11,8 +11,8 @@ #include /* SSE4.1 */ -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/variance.h" #include "aom_dsp/aom_filter.h" diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c index 6b8922b8c..1e67d392e 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c +++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c @@ -11,7 +11,20 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE __m256i dc_sum_64(const uint8_t *ref) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); + const __m256i zero = _mm256_setzero_si256(); + __m256i y0 = _mm256_sad_epu8(x0, zero); + __m256i y1 = _mm256_sad_epu8(x1, zero); + y0 = _mm256_add_epi64(y0, y1); + __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); + y0 = _mm256_add_epi64(u0, y0); + u0 = _mm256_unpackhi_epi64(y0, y0); + return _mm256_add_epi16(y0, u0); +} static INLINE __m256i dc_sum_32(const uint8_t *ref) { const __m256i x = _mm256_loadu_si256((const __m256i *)ref); @@ -25,13 +38,31 @@ static INLINE __m256i dc_sum_32(const uint8_t *ref) { static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst, ptrdiff_t stride) { - int i; - for (i = 0; i < height; ++i) { + for (int i = 0; i < height; ++i) { _mm256_storeu_si256((__m256i *)dst, *r); dst += stride; } } +static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1, + int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r0); + _mm256_storeu_si256((__m256i *)(dst + 32), *r1); + dst += stride; + } +} + +static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm256_storeu_si256((__m256i *)dst, *r); + _mm256_storeu_si256((__m256i *)(dst + 32), *r); + dst += stride; + } +} + void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_32(above); @@ -168,11 +199,58 @@ void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(left_sum); sum += 24; sum /= 48; - const __m256i row = _mm256_set1_epi8((uint8_t)sum); row_store_32xh(&row, 16, dst, stride); } +void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_32(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_64(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 64; + sum /= 128; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = dc_sum_32(left); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 48; + sum /= 96; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i sum_above = dc_sum_64(above); + __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); + sum_left = _mm256_add_epi16(sum_left, sum_above); + uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + sum += 40; + sum /= 80; + const __m256i row = _mm256_set1_epi8((uint8_t)sum); + row_store_64xh(&row, 16, dst, stride); +} + void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -187,6 +265,62 @@ void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 16, dst, stride); +} + void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -202,6 +336,63 @@ void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(left); + (void)above; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(left); + (void)above; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i sum = dc_sum_16_sse2(left); + (void)above; + + const __m128i eight = _mm_set1_epi16(8); + sum = _mm_add_epi16(sum, eight); + sum = _mm_srai_epi16(sum, 4); + const __m128i zero = _mm_setzero_si128(); + const __m128i r = _mm_shuffle_epi8(sum, zero); + const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); + row_store_64xh(&row, 16, dst, stride); +} + void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -211,6 +402,42 @@ void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m256i row = _mm256_set1_epi8((uint8_t)0x80); + row_store_64xh(&row, 16, dst, stride); +} + void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i row = _mm256_loadu_si256((const __m256i *)above); @@ -218,8 +445,39 @@ void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, row_store_32xh(&row, 16, dst, stride); } +void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row = _mm256_loadu_si256((const __m256i *)above); + (void)left; + row_store_32xh(&row, 64, dst, stride); +} + +void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 64, dst, stride); +} + +void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 32, dst, stride); +} + +void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); + const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); + (void)left; + row_store_32x2xh(&row0, &row1, 16, dst, stride); +} + // ----------------------------------------------------------------------------- -// TM_PRED +// PAETH_PRED // Return 16 16-bit pixels in one row (__m256i) static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top, @@ -336,6 +594,26 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + const __m256i top = get_top_vector(above); + + for (int j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + // Return 32 8-bit pixels in one row (__m256i) static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, const __m256i *top1, @@ -411,3 +689,123 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, rep = _mm256_add_epi16(rep, one); } } + +void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 2; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i, j; + for (j = 0; j < 4; ++j) { + const __m256i l = get_left_vector(left + j * 16); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m256i t0 = get_top_vector(above); + const __m256i t1 = get_top_vector(above + 16); + const __m256i t2 = get_top_vector(above + 32); + const __m256i t3 = get_top_vector(above + 48); + const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); + const __m256i one = _mm256_set1_epi16(1); + + int i; + const __m256i l = get_left_vector(left); + __m256i rep = _mm256_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + const __m256i l16 = _mm256_shuffle_epi8(l, rep); + + const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); + const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); + const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); + const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + + dst += stride; + rep = _mm256_add_epi16(rep, one); + } +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c index 2a83b9001..5b2452c8e 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c +++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c @@ -11,11 +11,11 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" -static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) { - int i; - for (i = 0; i < 4; ++i) { +static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; i += 2) { *(uint32_t *)dst = dc; dst += stride; *(uint32_t *)dst = dc; @@ -51,6 +51,17 @@ static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst, } } +static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst, + ptrdiff_t stride) { + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, *row); + _mm_store_si128((__m128i *)(dst + 16), *row); + _mm_store_si128((__m128i *)(dst + 32), *row); + _mm_store_si128((__m128i *)(dst + 48), *row); + dst += stride; + } +} + static INLINE __m128i dc_sum_4(const uint8_t *ref) { __m128i x = _mm_loadl_epi64((__m128i const *)ref); const __m128i zero = _mm_setzero_si128(); @@ -83,6 +94,34 @@ static INLINE __m128i dc_sum_32(const uint8_t *ref) { return _mm_add_epi16(x0, high); } +static INLINE __m128i dc_sum_64(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32)); + __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x2 = _mm_sad_epu8(x2, zero); + x3 = _mm_sad_epu8(x3, zero); + x0 = _mm_add_epi16(x0, x1); + x2 = _mm_add_epi16(x2, x3); + x0 = _mm_add_epi16(x0, x2); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#define DC_MULTIPLIER_1X2 0x5556 +#define DC_MULTIPLIER_1X4 0x3334 + +#define DC_SHIFT2 16 + +static INLINE int divide_using_multiply_shift(int num, int shift1, + int multiplier) { + const int interm = num >> shift1; + return interm * multiplier >> DC_SHIFT2; +} + // ----------------------------------------------------------------------------- // DC_PRED @@ -94,11 +133,26 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 6; - sum /= 12; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); + + const __m128i row = _mm_set1_epi8((uint8_t)sum); + const uint32_t pred = _mm_cvtsi128_si32(row); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_4(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); const __m128i row = _mm_set1_epi8((uint8_t)sum); const uint32_t pred = _mm_cvtsi128_si32(row); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -109,7 +163,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 6; - sum /= 12; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_8xh(&row, 4, dst, stride); @@ -123,11 +177,37 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 12; - sum /= 24; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_32(left); + __m128i sum_above = dc_sum_8(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_4(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 10; + sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_8(left); @@ -136,7 +216,7 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 12; - sum /= 24; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_16xh(&row, 8, dst, stride); } @@ -149,11 +229,37 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 24; - sum /= 48; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i sum_left = dc_sum_64(left); + __m128i sum_above = dc_sum_16(above); + sum_above = _mm_add_epi16(sum_left, sum_above); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_8(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 20; + sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_32(above); @@ -162,11 +268,63 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, uint32_t sum = _mm_cvtsi128_si32(sum_above); sum += 24; - sum /= 48; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2); const __m128i row = _mm_set1_epi8((uint8_t)sum); dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_32(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_64(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 64; + sum /= 128; + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_32(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 48; + sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i sum_above = dc_sum_64(above); + const __m128i sum_left = dc_sum_16(left); + sum_above = _mm_add_epi16(sum_above, sum_left); + + uint32_t sum = _mm_cvtsi128_si32(sum_above); + sum += 40; + sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4); + const __m128i row = _mm_set1_epi8((uint8_t)sum); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_TOP @@ -181,7 +339,21 @@ void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, sum_above = _mm_packus_epi16(sum_above, sum_above); const uint32_t pred = _mm_cvtsi128_si32(sum_above); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_4(above); + const __m128i two = _mm_set1_epi16((int16_t)2); + sum_above = _mm_add_epi16(sum_above, two); + sum_above = _mm_srai_epi16(sum_above, 2); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + sum_above = _mm_packus_epi16(sum_above, sum_above); + + const uint32_t pred = _mm_cvtsi128_si32(sum_above); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -208,6 +380,31 @@ void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_8(above); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_above = _mm_add_epi16(sum_above, four); + sum_above = _mm_srai_epi16(sum_above, 3); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + const __m128i row = _mm_shufflelo_epi16(sum_above, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; @@ -235,6 +432,33 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_16(above); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_above = _mm_add_epi16(sum_above, eight); + sum_above = _mm_srai_epi16(sum_above, 4); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -249,6 +473,62 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_LEFT @@ -263,7 +543,22 @@ void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, sum_left = _mm_packus_epi16(sum_left, sum_left); const uint32_t pred = _mm_cvtsi128_si32(sum_left); - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + sum_left = _mm_packus_epi16(sum_left, sum_left); + + const uint32_t pred = _mm_cvtsi128_si32(sum_left); + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -291,6 +586,33 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + const __m128i row = _mm_shufflelo_epi16(sum_left, 0); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_4(left); + const __m128i two = _mm_set1_epi16((uint16_t)2); + sum_left = _mm_add_epi16(sum_left, two); + sum_left = _mm_srai_epi16(sum_left, 2); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -319,6 +641,34 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_8(left); + const __m128i four = _mm_set1_epi16((uint16_t)4); + sum_left = _mm_add_epi16(sum_left, four); + sum_left = _mm_srai_epi16(sum_left, 3); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -333,15 +683,79 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } -// ----------------------------------------------------------------------------- -// DC_128 - -void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { +void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { (void)above; - (void)left; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_64(left); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_left = _mm_add_epi16(sum_left, thirtytwo); + sum_left = _mm_srai_epi16(sum_left, 6); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_32(left); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_left = _mm_add_epi16(sum_left, sixteen); + sum_left = _mm_srai_epi16(sum_left, 5); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + __m128i sum_left = dc_sum_16(left); + const __m128i eight = _mm_set1_epi16((uint16_t)8); + sum_left = _mm_add_epi16(sum_left, eight); + sum_left = _mm_srai_epi16(sum_left, 4); + sum_left = _mm_unpacklo_epi8(sum_left, sum_left); + sum_left = _mm_shufflelo_epi16(sum_left, 0); + const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left); + dc_store_64xh(&row, 16, dst, stride); +} + +// ----------------------------------------------------------------------------- +// DC_128 + +void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; const uint32_t pred = 0x80808080; - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const uint32_t pred = 0x80808080; + dc_store_4xh(pred, 16, dst, stride); } void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -360,6 +774,22 @@ void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 4, dst, stride); +} + void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; @@ -377,6 +807,23 @@ void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } +void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_16xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 8, dst, stride); +} + void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -386,6 +833,42 @@ void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)above; + (void)left; + const __m128i row = _mm_set1_epi8((uint8_t)128); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // V_PRED @@ -393,7 +876,14 @@ void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint32_t pred = *(uint32_t *)above; (void)left; - dc_store_4x8(pred, dst, stride); + dc_store_4xh(pred, 8, dst, stride); +} + +void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint32_t pred = *(uint32_t *)above; + (void)left; + dc_store_4xh(pred, 16, dst, stride); } void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, @@ -410,6 +900,20 @@ void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_8xh(&row, 16, dst, stride); } +void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_loadl_epi64((__m128i const *)above); + (void)left; + dc_store_8xh(&row, 32, dst, stride); +} + +void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 4, dst, stride); +} + void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i row = _mm_load_si128((__m128i const *)above); @@ -424,19 +928,75 @@ void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, dc_store_16xh(&row, 32, dst, stride); } -void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, +void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const __m128i row = _mm_load_si128((__m128i const *)above); + (void)left; + dc_store_16xh(&row, 64, dst, stride); +} + +static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { const __m128i row0 = _mm_load_si128((__m128i const *)above); const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + for (int i = 0; i < height; ++i) { + _mm_store_si128((__m128i *)dst, row0); + _mm_store_si128((__m128i *)(dst + 16), row1); + dst += stride; + } +} + +void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { (void)left; - int i; - for (i = 0; i < 16; ++i) { + v_predictor_32xh(dst, stride, above, 8); +} + +void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 16); +} + +void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_32xh(dst, stride, above, 64); +} + +static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int height) { + const __m128i row0 = _mm_load_si128((__m128i const *)above); + const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16)); + const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32)); + const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48)); + for (int i = 0; i < height; ++i) { _mm_store_si128((__m128i *)dst, row0); _mm_store_si128((__m128i *)(dst + 16), row1); + _mm_store_si128((__m128i *)(dst + 32), row2); + _mm_store_si128((__m128i *)(dst + 48), row3); dst += stride; } } +void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 64); +} + +void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 32); +} + +void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)left; + v_predictor_64xh(dst, stride, above, 16); +} + // ----------------------------------------------------------------------------- // H_PRED @@ -471,25 +1031,7 @@ void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, *(uint32_t *)dst = _mm_cvtsi128_si32(row3); } -void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - __m128i left_col = _mm_loadl_epi64((__m128i const *)left); - left_col = _mm_unpacklo_epi8(left_col, left_col); - __m128i row0 = _mm_shufflelo_epi16(left_col, 0); - __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); - __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); - __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); - dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); -} - -void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, +void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; const __m128i left_col = _mm_load_si128((__m128i const *)left); @@ -500,13 +1042,13 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); @@ -514,26 +1056,26 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, row1 = _mm_shufflelo_epi16(left_col_low, 0x55); row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); row3 = _mm_shufflelo_epi16(left_col_low, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; row0 = _mm_shufflelo_epi16(left_col_high, 0); row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); - _mm_storel_epi64((__m128i *)dst, row0); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); dst += stride; - _mm_storel_epi64((__m128i *)dst, row1); + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); dst += stride; - _mm_storel_epi64((__m128i *)dst, row2); + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); dst += stride; - _mm_storel_epi64((__m128i *)dst, row3); + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); dst += stride; left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); @@ -541,6 +1083,24 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, row1 = _mm_shufflelo_epi16(left_col_high, 0x55); row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + *(uint32_t *)dst = _mm_cvtsi128_si32(row0); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row1); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row2); + dst += stride; + *(uint32_t *)dst = _mm_cvtsi128_si32(row3); +} + +void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + __m128i left_col = _mm_loadl_epi64((__m128i const *)left); + left_col = _mm_unpacklo_epi8(left_col, left_col); + __m128i row0 = _mm_shufflelo_epi16(left_col, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff); _mm_storel_epi64((__m128i *)dst, row0); dst += stride; _mm_storel_epi64((__m128i *)dst, row1); @@ -550,6 +1110,82 @@ void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, _mm_storel_epi64((__m128i *)dst, row3); } +static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int count) { + (void)above; + for (int i = 0; i < count; ++i) { + const __m128i left_col = _mm_load_si128((__m128i const *)left); + __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col); + __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col); + + __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0); + __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low); + row0 = _mm_shufflelo_epi16(left_col_low, 0); + row1 = _mm_shufflelo_epi16(left_col_low, 0x55); + row2 = _mm_shufflelo_epi16(left_col_low, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_low, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + + left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high); + row0 = _mm_shufflelo_epi16(left_col_high, 0); + row1 = _mm_shufflelo_epi16(left_col_high, 0x55); + row2 = _mm_shufflelo_epi16(left_col_high, 0xaa); + row3 = _mm_shufflelo_epi16(left_col_high, 0xff); + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + dst += stride; + left += 16; + } +} + +void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 1); +} + +void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + h_predictor_8x16xc(dst, stride, above, left, 2); +} + static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst, ptrdiff_t stride) { int i; @@ -601,6 +1237,14 @@ static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst, h_pred_store_16xh(row, 4, dst, stride); } +void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + const __m128i left_col = _mm_loadl_epi64((const __m128i *)left); + const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p, dst, stride); +} + void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; @@ -611,29 +1255,38 @@ void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, h_prediction_16x8_2(&left_col_8p, dst, stride); } -void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - __m128i left_col, left_col_8p; - (void)above; +static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int count) { int i = 0; - do { - left_col = _mm_load_si128((const __m128i *)left); - left_col_8p = _mm_unpacklo_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); + const __m128i left_col = _mm_load_si128((const __m128i *)left); + const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_lo, dst, stride); dst += stride << 2; - h_prediction_16x8_2(&left_col_8p, dst, stride); + h_prediction_16x8_2(&left_col_8p_lo, dst, stride); dst += stride << 2; - left_col_8p = _mm_unpackhi_epi8(left_col, left_col); - h_prediction_16x8_1(&left_col_8p, dst, stride); + const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col); + h_prediction_16x8_1(&left_col_8p_hi, dst, stride); dst += stride << 2; - h_prediction_16x8_2(&left_col_8p, dst, stride); + h_prediction_16x8_2(&left_col_8p_hi, dst, stride); dst += stride << 2; left += 16; i++; - } while (i < 2); + } while (i < count); +} + +void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 2); +} + +void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_16xh(dst, stride, left, 4); } static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst, @@ -664,6 +1317,19 @@ static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst, h_pred_store_32xh(row, 4, dst, stride); } +void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i left_col, left_col_8p; + (void)above; + + left_col = _mm_load_si128((const __m128i *)left); + + left_col_8p = _mm_unpacklo_epi8(left_col, left_col); + h_prediction_32x8_1(&left_col_8p, dst, stride); + dst += stride << 2; + h_prediction_32x8_2(&left_col_8p, dst, stride); +} + void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i left_col, left_col_8p; @@ -682,3 +1348,83 @@ void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, dst += stride << 2; h_prediction_32x8_2(&left_col_8p, dst, stride); } + +static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_32xh(dst, stride, left, 64); +} + +static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int height) { + int i = height >> 2; + do { + __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]); + left4 = _mm_unpacklo_epi8(left4, left4); + left4 = _mm_unpacklo_epi8(left4, left4); + const __m128i r0 = _mm_shuffle_epi32(left4, 0x0); + const __m128i r1 = _mm_shuffle_epi32(left4, 0x55); + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r0); + _mm_store_si128((__m128i *)(dst + 32), r0); + _mm_store_si128((__m128i *)(dst + 48), r0); + _mm_store_si128((__m128i *)(dst + stride), r1); + _mm_store_si128((__m128i *)(dst + stride + 16), r1); + _mm_store_si128((__m128i *)(dst + stride + 32), r1); + _mm_store_si128((__m128i *)(dst + stride + 48), r1); + const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa); + const __m128i r3 = _mm_shuffle_epi32(left4, 0xff); + _mm_store_si128((__m128i *)(dst + stride * 2), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2); + _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2); + _mm_store_si128((__m128i *)(dst + stride * 3), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3); + _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3); + left += 4; + dst += stride * 4; + } while (--i); +} + +void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 64); +} + +void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 32); +} + +void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + (void)above; + h_predictor_64xh(dst, stride, left, 16); +} diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c index 85b82744e..807ed1770 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -11,11 +11,12 @@ #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/intrapred_common.h" // ----------------------------------------------------------------------------- -// TM_PRED +// PAETH_PRED // Return 8 16-bit pixels in one row static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top, @@ -82,6 +83,26 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_load_si128((const __m128i *)left); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_loadl_epi64((const __m128i *)left); @@ -145,6 +166,28 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i t = _mm_loadl_epi64((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i t16 = _mm_unpacklo_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); + + _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row)); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + // Return 16 8-bit pixels in one row static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, const __m128i *top1, @@ -154,6 +197,27 @@ static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0, return _mm_packus_epi16(p0, p1); } +void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < 4; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i l = _mm_loadl_epi64((const __m128i *)left); @@ -234,6 +298,57 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i t = _mm_load_si128((const __m128i *)above); + const __m128i zero = _mm_setzero_si128(); + const __m128i top0 = _mm_unpacklo_epi8(t, zero); + const __m128i top1 = _mm_unpackhi_epi8(t, zero); + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + + for (int j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (int i = 0; i < 16; ++i) { + const __m128i l16 = _mm_shuffle_epi8(l, rep); + const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); + _mm_store_si128((__m128i *)dst, row); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + __m128i rep = _mm_set1_epi16(0x8000); + const __m128i one = _mm_set1_epi16(1); + const __m128i l = _mm_loadl_epi64((const __m128i *)left); + __m128i l16; + + for (int i = 0; i < 8; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -307,6 +422,162 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, } } +void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + + _mm_store_si128((__m128i *)dst, r32l); + _mm_store_si128((__m128i *)(dst + 16), r32h); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 2; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i, j; + for (j = 0; j < 4; ++j) { + const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } + } +} + +void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + const __m128i a = _mm_load_si128((const __m128i *)above); + const __m128i b = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i c = _mm_load_si128((const __m128i *)(above + 32)); + const __m128i d = _mm_load_si128((const __m128i *)(above + 48)); + const __m128i zero = _mm_setzero_si128(); + const __m128i al = _mm_unpacklo_epi8(a, zero); + const __m128i ah = _mm_unpackhi_epi8(a, zero); + const __m128i bl = _mm_unpacklo_epi8(b, zero); + const __m128i bh = _mm_unpackhi_epi8(b, zero); + const __m128i cl = _mm_unpacklo_epi8(c, zero); + const __m128i ch = _mm_unpackhi_epi8(c, zero); + const __m128i dl = _mm_unpacklo_epi8(d, zero); + const __m128i dh = _mm_unpackhi_epi8(d, zero); + + const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); + const __m128i one = _mm_set1_epi16(1); + __m128i l16; + + int i; + const __m128i l = _mm_load_si128((const __m128i *)left); + __m128i rep = _mm_set1_epi16(0x8000); + for (i = 0; i < 16; ++i) { + l16 = _mm_shuffle_epi8(l, rep); + const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); + const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16); + const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16); + const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16); + + _mm_store_si128((__m128i *)dst, r0); + _mm_store_si128((__m128i *)(dst + 16), r1); + _mm_store_si128((__m128i *)(dst + 32), r2); + _mm_store_si128((__m128i *)(dst + 48), r3); + dst += stride; + rep = _mm_add_epi16(rep, one); + } +} + // ----------------------------------------------------------------------------- // SMOOTH_PRED @@ -315,9 +586,15 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, // pixels[2]: right_pred vector static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, int height, __m128i *pixels) { - __m128i d = _mm_loadl_epi64((const __m128i *)above); + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + if (height == 4) + pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[1] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[1] = _mm_loadu_si128(((const __m128i *)left)); + pixels[2] = _mm_set1_epi16((uint16_t)above[3]); - pixels[1] = _mm_loadl_epi64((const __m128i *)left); const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); const __m128i zero = _mm_setzero_si128(); @@ -325,45 +602,52 @@ static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left, pixels[0] = _mm_unpacklo_epi16(d, bp); } -// weights[0]: weights_h vector -// weights[1]: scale - weights_h vecotr -// weights[2]: weights_w and scale - weights_w interleave vector +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], second half for height = 16 only +// weight_h[3]: same as [1], second half for height = 16 only +// weight_w[0]: weights_w and scale - weights_w interleave vector static INLINE void load_weight_w4(const uint8_t *weight_array, int height, - __m128i *weights) { - __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); + __m128i *weight_h, __m128i *weight_w) { const __m128i zero = _mm_setzero_si128(); - - weights[0] = _mm_unpacklo_epi8(t, zero); const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - weights[1] = _mm_sub_epi16(d, weights[0]); - weights[2] = _mm_unpacklo_epi16(weights[0], weights[1]); + const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weight_h[0] = _mm_unpacklo_epi8(t, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); if (height == 8) { - t = _mm_srli_si128(t, 4); - weights[0] = _mm_unpacklo_epi8(t, zero); - weights[1] = _mm_sub_epi16(d, weights[0]); + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); } } -static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *weight, - int h, uint8_t *dst, ptrdiff_t stride) { +static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, + const __m128i *ww, int h, uint8_t *dst, + ptrdiff_t stride, int second_half) { const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); const __m128i one = _mm_set1_epi16(1); const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); __m128i d = _mm_set1_epi16(0x100); - int i; - for (i = 0; i < h; ++i) { - const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); - const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); __m128i s = _mm_madd_epi16(pixel[0], wh_sc); __m128i b = _mm_shuffle_epi8(pixel[1], rep); b = _mm_unpacklo_epi16(b, pixel[2]); - __m128i sum = _mm_madd_epi16(b, weight[2]); + __m128i sum = _mm_madd_epi16(b, ww[0]); sum = _mm_add_epi32(s, sum); sum = _mm_add_epi32(sum, round); @@ -383,10 +667,10 @@ void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, __m128i pixels[3]; load_pixel_w4(above, left, 4, pixels); - __m128i weights[3]; - load_weight_w4(sm_weight_arrays, 4, weights); + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 4, wh, ww); - smooth_pred_4xh(pixels, weights, 4, dst, stride); + smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0); } void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, @@ -394,33 +678,68 @@ void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, __m128i pixels[3]; load_pixel_w4(above, left, 8, pixels); - __m128i weights[3]; - load_weight_w4(sm_weight_arrays, 8, weights); + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 8, wh, ww); + + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); +} + +void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[3]; + load_pixel_w4(above, left, 16, pixels); + + __m128i wh[4], ww[2]; + load_weight_w4(sm_weight_arrays, 16, wh, ww); - smooth_pred_4xh(pixels, weights, 8, dst, stride); + smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1); } // pixels[0]: above and below_pred interleave vector, first half // pixels[1]: above and below_pred interleave vector, second half // pixels[2]: left vector // pixels[3]: right_pred vector +// pixels[4]: above and below_pred interleave vector, first half +// pixels[5]: above and below_pred interleave vector, second half +// pixels[6]: left vector + 16 +// pixels[7]: right_pred vector static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left, int height, __m128i *pixels) { - __m128i d = _mm_loadl_epi64((const __m128i *)above); - pixels[3] = _mm_set1_epi16((uint16_t)above[7]); - pixels[2] = _mm_load_si128((const __m128i *)left); - const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); const __m128i zero = _mm_setzero_si128(); - + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + __m128i d = _mm_loadl_epi64((const __m128i *)above); d = _mm_unpacklo_epi8(d, zero); pixels[0] = _mm_unpacklo_epi16(d, bp); pixels[1] = _mm_unpackhi_epi16(d, bp); + + pixels[3] = _mm_set1_epi16((uint16_t)above[7]); + + if (height == 4) { + pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + } else if (height == 8) { + pixels[2] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[2] = _mm_load_si128((const __m128i *)left); + } else { + pixels[2] = _mm_load_si128((const __m128i *)left); + pixels[4] = pixels[0]; + pixels[5] = pixels[1]; + pixels[6] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[7] = pixels[3]; + } } // weight_h[0]: weight_h vector // weight_h[1]: scale - weight_h vector -// weight_h[2]: same as [0], second half for height = 16 only -// weight_h[3]: same as [1], second half for height = 16 only +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half // weight_w[1]: weights_w and scale - weights_w interleave vector, second half static INLINE void load_weight_w8(const uint8_t *weight_array, int height, @@ -429,7 +748,6 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height, const int we_offset = height < 8 ? 4 : 8; __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]); weight_h[0] = _mm_unpacklo_epi8(we, zero); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); weight_h[1] = _mm_sub_epi16(d, weight_h[0]); @@ -450,6 +768,19 @@ static INLINE void load_weight_w8(const uint8_t *weight_array, int height, weight_h[1] = _mm_sub_epi16(d, weight_h[0]); weight_h[2] = _mm_unpackhi_epi8(we, zero); weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else if (height == 32) { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); } } @@ -531,355 +862,831 @@ void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1); } -// pixels[0]: above and below_pred interleave vector, 1/4 -// pixels[1]: above and below_pred interleave vector, 2/4 -// pixels[2]: above and below_pred interleave vector, 3/4 -// pixels[3]: above and below_pred interleave vector, 3/4 -// pixels[4]: left vector -// pixels[5]: left vector, h = 32 only -// pixels[6]: right_pred vector -static INLINE void load_pixel_w16(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - __m128i ab = _mm_load_si128((const __m128i *)above); - pixels[6] = _mm_set1_epi16((uint16_t)above[15]); - pixels[4] = _mm_load_si128((const __m128i *)left); - pixels[5] = _mm_load_si128((const __m128i *)(left + 16)); - const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); - const __m128i zero = _mm_setzero_si128(); +void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[8]; + load_pixel_w8(above, left, 32, pixels); - __m128i x = _mm_unpacklo_epi8(ab, zero); - pixels[0] = _mm_unpacklo_epi16(x, bp); - pixels[1] = _mm_unpackhi_epi16(x, bp); + __m128i wh[8], ww[2]; + load_weight_w8(sm_weight_arrays, 32, wh, ww); - x = _mm_unpackhi_epi8(ab, zero); - pixels[2] = _mm_unpacklo_epi16(x, bp); - pixels[3] = _mm_unpackhi_epi16(x, bp); + smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1); } -// weight_h[0]: weight_h vector -// weight_h[1]: scale - weight_h vector -// weight_h[2]: same as [0], second half for height = 16 only -// weight_h[3]: same as [1], second half for height = 16 only -// ... ... -// weight_w[0]: weights_w and scale - weights_w interleave vector, first half -// weight_w[1]: weights_w and scale - weights_w interleave vector, second half -// ... ... -static INLINE void load_weight_w16(const uint8_t *weight_array, int height, - __m128i *weight_h, __m128i *weight_w) { - const __m128i zero = _mm_setzero_si128(); - __m128i w8 = _mm_loadu_si128((const __m128i *)&weight_array[8]); - __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); - __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); - __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); - const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); - - if (height == 8) { - weight_h[0] = _mm_unpacklo_epi8(w8, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); // scale - weight_h - - __m128i x = _mm_unpacklo_epi8(w16, zero); - __m128i y = _mm_sub_epi16(d, x); - weight_w[0] = _mm_unpacklo_epi16(x, y); - weight_w[1] = _mm_unpackhi_epi16(x, y); - x = _mm_unpackhi_epi8(w16, zero); - y = _mm_sub_epi16(d, x); - weight_w[2] = _mm_unpacklo_epi16(x, y); - weight_w[3] = _mm_unpackhi_epi16(x, y); - } - - if (height == 16) { - weight_h[0] = _mm_unpacklo_epi8(w16, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(w16, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); - - weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); - weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); - weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); - weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); +static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i top_right = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale)); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y); + __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0); + pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round); + pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x); + const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero); + const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero); + + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + const __m128i scale_m_weights_x = + _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero)); + const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right); + const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero); + const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero); + + pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl); + pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl); + + pred_lo = _mm_add_epi32(pred_lo, swxtr_lo); + pred_hi = _mm_add_epi32(pred_hi, swxtr_hi); + + pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale)); + pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale)); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; } +} - if (height == 32) { - weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); - - __m128i x = _mm_unpacklo_epi8(w16, zero); - __m128i y = _mm_sub_epi16(d, x); - weight_w[0] = _mm_unpacklo_epi16(x, y); - weight_w[1] = _mm_unpackhi_epi16(x, y); - x = _mm_unpackhi_epi8(w16, zero); - y = _mm_sub_epi16(d, x); - weight_w[2] = _mm_unpacklo_epi16(x, y); - weight_w[3] = _mm_unpackhi_epi16(x, y); - - weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); - weight_h[5] = _mm_sub_epi16(d, weight_h[4]); - weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); - weight_h[7] = _mm_sub_epi16(d, weight_h[6]); - } +void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 4); } -static INLINE void smooth_pred_16x8(const __m128i *pixels, const __m128i *wh, - const __m128i *ww, uint8_t *dst, - ptrdiff_t stride, int quarter) { +void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 64, 16); +} + +void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_predictor_wxh(dst, stride, above, left, 16, 64); +} + +// ----------------------------------------------------------------------------- +// SMOOTH_V_PRED + +// pixels[0]: above and below_pred interleave vector +static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); +} + +// weights[0]: weights_h vector +// weights[1]: scale - weights_h vector +static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height == 4) { + const __m128i weight = + _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else if (height == 8) { + const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + } else { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weights[0] = _mm_unpacklo_epi8(weight, zero); + weights[1] = _mm_sub_epi16(d, weights[0]); + weights[2] = _mm_unpackhi_epi8(weight, zero); + weights[3] = _mm_sub_epi16(d, weights[2]); + } +} + +static INLINE void smooth_v_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i inc = _mm_set1_epi16(0x202); + const __m128i gat = _mm_set1_epi32(0xc080400); __m128i d = _mm_set1_epi16(0x100); - const __m128i one = _mm_set1_epi16(1); + + for (int i = 0; i < h; ++i) { + const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d); + const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d); + const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + __m128i sum = _mm_madd_epi16(pixel[0], wh_sc); + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; + d = _mm_add_epi16(d, inc); + } +} + +void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 4, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 4, weights); + + smooth_v_pred_4xh(&pixels, weights, 4, dst, stride); +} + +void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 8, &pixels); + + __m128i weights[2]; + load_weight_v_w4(sm_weight_arrays, 8, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); +} + +void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels; + load_pixel_v_w4(above, left, 16, &pixels); + + __m128i weights[4]; + load_weight_v_w4(sm_weight_arrays, 16, weights); + + smooth_v_pred_4xh(&pixels, weights, 8, dst, stride); + dst += stride << 3; + smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride); +} + +// pixels[0]: above and below_pred interleave vector, first half +// pixels[1]: above and below_pred interleave vector, second half +static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + const __m128i zero = _mm_setzero_si128(); + __m128i d = _mm_loadl_epi64((const __m128i *)above); + const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); + d = _mm_unpacklo_epi8(d, zero); + pixels[0] = _mm_unpacklo_epi16(d, bp); + pixels[1] = _mm_unpackhi_epi16(d, bp); +} + +// weight_h[0]: weight_h vector +// weight_h[1]: scale - weight_h vector +// weight_h[2]: same as [0], offset 8 +// weight_h[3]: same as [1], offset 8 +// weight_h[4]: same as [0], offset 16 +// weight_h[5]: same as [1], offset 16 +// weight_h[6]: same as [0], offset 24 +// weight_h[7]: same as [1], offset 24 +static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height, + __m128i *weight_h) { + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + + if (height < 16) { + const int offset = height < 8 ? 4 : 8; + const __m128i weight = + _mm_loadu_si128((const __m128i *)&weight_array[offset]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + } else if (height == 16) { + const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]); + weight_h[0] = _mm_unpacklo_epi8(weight, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + } else { + const __m128i weight_lo = + _mm_loadu_si128((const __m128i *)&weight_array[32]); + weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero); + weight_h[1] = _mm_sub_epi16(d, weight_h[0]); + weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero); + weight_h[3] = _mm_sub_epi16(d, weight_h[2]); + const __m128i weight_hi = + _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero); + weight_h[5] = _mm_sub_epi16(d, weight_h[4]); + weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero); + weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + } +} + +static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh, + int h, uint8_t *dst, ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); - __m128i rep = - (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); - const __m128i left = (quarter < 2) ? pixels[4] : pixels[5]; + __m128i d = _mm_set1_epi16(0x100); - int i; - for (i = 0; i < 8; ++i) { + for (int i = 0; i < h; ++i) { const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc); __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc); - __m128i s2 = _mm_madd_epi16(pixels[2], wh_sc); - __m128i s3 = _mm_madd_epi16(pixels[3], wh_sc); - - __m128i b = _mm_shuffle_epi8(left, rep); - b = _mm_unpacklo_epi16(b, pixels[6]); - __m128i sum0 = _mm_madd_epi16(b, ww[0]); - __m128i sum1 = _mm_madd_epi16(b, ww[1]); - __m128i sum2 = _mm_madd_epi16(b, ww[2]); - __m128i sum3 = _mm_madd_epi16(b, ww[3]); - - s0 = _mm_add_epi32(s0, sum0); - s0 = _mm_add_epi32(s0, round); - s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale); - - s1 = _mm_add_epi32(s1, sum1); - s1 = _mm_add_epi32(s1, round); - s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale); - s2 = _mm_add_epi32(s2, sum2); - s2 = _mm_add_epi32(s2, round); - s2 = _mm_srai_epi32(s2, 1 + sm_weight_log2_scale); + s0 = _mm_add_epi32(s0, pred_round); + s0 = _mm_srai_epi32(s0, sm_weight_log2_scale); - s3 = _mm_add_epi32(s3, sum3); - s3 = _mm_add_epi32(s3, round); - s3 = _mm_srai_epi32(s3, 1 + sm_weight_log2_scale); - - sum0 = _mm_packus_epi16(s0, s1); - sum0 = _mm_shuffle_epi8(sum0, gat); - sum1 = _mm_packus_epi16(s2, s3); - sum1 = _mm_shuffle_epi8(sum1, gat); - - _mm_storel_epi64((__m128i *)dst, sum0); - _mm_storel_epi64((__m128i *)(dst + 8), sum1); + s1 = _mm_add_epi32(s1, pred_round); + s1 = _mm_srai_epi32(s1, sm_weight_log2_scale); + __m128i sum01 = _mm_packus_epi16(s0, s1); + sum01 = _mm_shuffle_epi8(sum01, gat); + _mm_storel_epi64((__m128i *)dst, sum01); dst += stride; - rep = _mm_add_epi16(rep, one); + d = _mm_add_epi16(d, inc); } } -void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[7]; - load_pixel_w16(above, left, 8, pixels); +void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 4, pixels); - __m128i wh[2], ww[4]; - load_weight_w16(sm_weight_arrays, 8, wh, ww); + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 4, wh); - smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + smooth_v_pred_8xh(pixels, wh, 4, dst, stride); } -void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, +void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i pixels[7]; - load_pixel_w16(above, left, 16, pixels); + __m128i pixels[2]; + load_pixel_v_w8(above, left, 8, pixels); + + __m128i wh[2]; + load_weight_v_w8(sm_weight_arrays, 8, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); +} - __m128i wh[4], ww[4]; - load_weight_w16(sm_weight_arrays, 16, wh, ww); +void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 16, pixels); - smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + __m128i wh[4]; + load_weight_v_w8(sm_weight_arrays, 16, wh); + + smooth_v_pred_8xh(pixels, wh, 8, dst, stride); dst += stride << 3; - smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); } -void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - __m128i pixels[7]; - load_pixel_w16(above, left, 32, pixels); +void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_v_w8(above, left, 32, pixels); - __m128i wh[8], ww[4]; - load_weight_w16(sm_weight_arrays, 32, wh, ww); + __m128i wh[8]; + load_weight_v_w8(sm_weight_arrays, 32, wh); - smooth_pred_16x8(pixels, wh, ww, dst, stride, 0); + smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride); dst += stride << 3; - smooth_pred_16x8(pixels, &wh[2], ww, dst, stride, 1); + smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride); dst += stride << 3; - smooth_pred_16x8(pixels, &wh[4], ww, dst, stride, 2); + smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride); dst += stride << 3; - smooth_pred_16x8(pixels, &wh[6], ww, dst, stride, 3); + smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride); } -static INLINE void load_pixel_w32(const uint8_t *above, const uint8_t *left, - int height, __m128i *pixels) { - __m128i ab0 = _mm_load_si128((const __m128i *)above); - __m128i ab1 = _mm_load_si128((const __m128i *)(above + 16)); +static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_h = sm_weight_arrays + bh; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i dup16 = _mm_set1_epi32(0x01000100); + const __m128i bottom_left = + _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i round = + _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]); + const __m128i scale_m_weights_y = + _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16); + const __m128i wl_y = + _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x)); + // 8 -> 16 + const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero); + const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y); + const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y); + // top_x * weights_y + scale_m_weights_y * bottom_left + __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y); + __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y); + + pred_lo = _mm_add_epi32(pred_lo, round); + pred_hi = _mm_add_epi32(pred_hi, round); + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} - pixels[10] = _mm_set1_epi16((uint16_t)above[31]); - pixels[8] = _mm_load_si128((const __m128i *)left); - pixels[9] = _mm_load_si128((const __m128i *)(left + 16)); +void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 4); +} - const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]); - const __m128i zero = _mm_setzero_si128(); +void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 64); +} - __m128i x = _mm_unpacklo_epi8(ab0, zero); - pixels[0] = _mm_unpacklo_epi16(x, bp); - pixels[1] = _mm_unpackhi_epi16(x, bp); +void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 32); +} - x = _mm_unpackhi_epi8(ab0, zero); - pixels[2] = _mm_unpacklo_epi16(x, bp); - pixels[3] = _mm_unpackhi_epi16(x, bp); +void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 64, 16); +} - x = _mm_unpacklo_epi8(ab1, zero); - pixels[4] = _mm_unpacklo_epi16(x, bp); - pixels[5] = _mm_unpackhi_epi16(x, bp); +void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_v_predictor_wxh(dst, stride, above, left, 16, 64); +} - x = _mm_unpackhi_epi8(ab1, zero); - pixels[6] = _mm_unpacklo_epi16(x, bp); - pixels[7] = _mm_unpackhi_epi16(x, bp); +// ----------------------------------------------------------------------------- +// SMOOTH_H_PRED + +// pixels[0]: left vector +// pixels[1]: right_pred vector +static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + if (height == 4) + pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + else if (height == 8) + pixels[0] = _mm_loadl_epi64(((const __m128i *)left)); + else + pixels[0] = _mm_loadu_si128(((const __m128i *)left)); + pixels[1] = _mm_set1_epi16((uint16_t)above[3]); } -static INLINE void load_weight_w32(const uint8_t *weight_array, int height, - __m128i *weight_h, __m128i *weight_w) { +// weights[0]: weights_w and scale - weights_w interleave vector +static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height, + __m128i *weights) { + (void)height; + const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]); const __m128i zero = _mm_setzero_si128(); - __m128i w16 = _mm_loadu_si128((const __m128i *)&weight_array[16]); - __m128i w32_0 = _mm_loadu_si128((const __m128i *)&weight_array[32]); - __m128i w32_1 = _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]); + + const __m128i weights_0 = _mm_unpacklo_epi8(t, zero); const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i weights_1 = _mm_sub_epi16(d, weights_0); + weights[0] = _mm_unpacklo_epi16(weights_0, weights_1); +} - if (height == 16) { - weight_h[0] = _mm_unpacklo_epi8(w16, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(w16, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); +static INLINE void smooth_h_pred_4xh(const __m128i *pixel, + const __m128i *weight, int h, uint8_t *dst, + ptrdiff_t stride) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + const __m128i one = _mm_set1_epi16(1); + const __m128i gat = _mm_set1_epi32(0xc080400); + __m128i rep = _mm_set1_epi16(0x8000); - __m128i x = _mm_unpacklo_epi8(w32_0, zero); - __m128i y = _mm_sub_epi16(d, x); - weight_w[0] = _mm_unpacklo_epi16(x, y); - weight_w[1] = _mm_unpackhi_epi16(x, y); + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixel[0], rep); + b = _mm_unpacklo_epi16(b, pixel[1]); + __m128i sum = _mm_madd_epi16(b, weight[0]); - x = _mm_unpackhi_epi8(w32_0, zero); - y = _mm_sub_epi16(d, x); - weight_w[2] = _mm_unpacklo_epi16(x, y); - weight_w[3] = _mm_unpackhi_epi16(x, y); + sum = _mm_add_epi32(sum, pred_round); + sum = _mm_srai_epi32(sum, sm_weight_log2_scale); - x = _mm_unpacklo_epi8(w32_1, zero); - y = _mm_sub_epi16(d, x); - weight_w[4] = _mm_unpacklo_epi16(x, y); - weight_w[5] = _mm_unpackhi_epi16(x, y); + sum = _mm_shuffle_epi8(sum, gat); + *(uint32_t *)dst = _mm_cvtsi128_si32(sum); + dst += stride; - x = _mm_unpackhi_epi8(w32_1, zero); - y = _mm_sub_epi16(d, x); - weight_w[6] = _mm_unpacklo_epi16(x, y); - weight_w[7] = _mm_unpackhi_epi16(x, y); + rep = _mm_add_epi16(rep, one); } +} - if (height == 32) { - weight_h[0] = _mm_unpacklo_epi8(w32_0, zero); - weight_h[1] = _mm_sub_epi16(d, weight_h[0]); - weight_h[2] = _mm_unpackhi_epi8(w32_0, zero); - weight_h[3] = _mm_sub_epi16(d, weight_h[2]); +void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 4, pixels); - weight_h[4] = _mm_unpacklo_epi8(w32_1, zero); - weight_h[5] = _mm_sub_epi16(d, weight_h[4]); - weight_h[6] = _mm_unpackhi_epi8(w32_1, zero); - weight_h[7] = _mm_sub_epi16(d, weight_h[6]); + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 4, &weights); - weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]); - weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]); - weight_w[2] = _mm_unpacklo_epi16(weight_h[2], weight_h[3]); - weight_w[3] = _mm_unpackhi_epi16(weight_h[2], weight_h[3]); + smooth_h_pred_4xh(pixels, &weights, 4, dst, stride); +} + +void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 8, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 8, &weights); + + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); +} + +void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w4(above, left, 16, pixels); + + __m128i weights; + load_weight_h_w4(sm_weight_arrays, 8, &weights); + + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); + dst += stride << 3; - weight_w[4] = _mm_unpacklo_epi16(weight_h[4], weight_h[5]); - weight_w[5] = _mm_unpackhi_epi16(weight_h[4], weight_h[5]); - weight_w[6] = _mm_unpacklo_epi16(weight_h[6], weight_h[7]); - weight_w[7] = _mm_unpackhi_epi16(weight_h[6], weight_h[7]); + pixels[0] = _mm_srli_si128(pixels[0], 8); + smooth_h_pred_4xh(pixels, &weights, 8, dst, stride); +} + +// pixels[0]: left vector +// pixels[1]: right_pred vector +// pixels[2]: left vector + 16 +// pixels[3]: right_pred vector +static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left, + int height, __m128i *pixels) { + pixels[1] = _mm_set1_epi16((uint16_t)above[7]); + + if (height == 4) { + pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]); + } else if (height == 8) { + pixels[0] = _mm_loadl_epi64((const __m128i *)left); + } else if (height == 16) { + pixels[0] = _mm_load_si128((const __m128i *)left); + } else { + pixels[0] = _mm_load_si128((const __m128i *)left); + pixels[2] = _mm_load_si128((const __m128i *)(left + 16)); + pixels[3] = pixels[1]; } } -static INLINE void smooth_pred_32x8(const __m128i *pixels, const __m128i *wh, - const __m128i *ww, uint8_t *dst, - ptrdiff_t stride, int quarter) { - __m128i d = _mm_set1_epi16(0x100); +// weight_w[0]: weights_w and scale - weights_w interleave vector, first half +// weight_w[1]: weights_w and scale - weights_w interleave vector, second half +static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height, + __m128i *weight_w) { + (void)height; + const __m128i zero = _mm_setzero_si128(); + const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]); + const __m128i tmp1 = _mm_unpacklo_epi8(we, zero); + const __m128i tmp2 = _mm_sub_epi16(d, tmp1); + weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2); + weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2); +} + +static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww, + int h, uint8_t *dst, ptrdiff_t stride, + int second_half) { + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); const __m128i one = _mm_set1_epi16(1); - const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale)); - __m128i rep = - (quarter % 2 == 0) ? _mm_set1_epi16(0x8000) : _mm_set1_epi16(0x8008); - const __m128i left = (quarter < 2) ? pixels[8] : pixels[9]; + __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); - int i; - for (i = 0; i < 8; ++i) { - const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d); - const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d); - const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc); + for (int i = 0; i < h; ++i) { + __m128i b = _mm_shuffle_epi8(pixels[0], rep); + b = _mm_unpacklo_epi16(b, pixels[1]); + __m128i sum0 = _mm_madd_epi16(b, ww[0]); + __m128i sum1 = _mm_madd_epi16(b, ww[1]); - int j; - __m128i s[8]; - __m128i b = _mm_shuffle_epi8(left, rep); - b = _mm_unpacklo_epi16(b, pixels[10]); + sum0 = _mm_add_epi32(sum0, pred_round); + sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale); - for (j = 0; j < 8; ++j) { - s[j] = _mm_madd_epi16(pixels[j], wh_sc); - s[j] = _mm_add_epi32(s[j], _mm_madd_epi16(b, ww[j])); - s[j] = _mm_add_epi32(s[j], round); - s[j] = _mm_srai_epi32(s[j], 1 + sm_weight_log2_scale); - } + sum1 = _mm_add_epi32(sum1, pred_round); + sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale); - for (j = 0; j < 8; j += 2) { - __m128i sum = _mm_packus_epi16(s[j], s[j + 1]); - sum = _mm_shuffle_epi8(sum, gat); - _mm_storel_epi64((__m128i *)(dst + (j << 2)), sum); - } + sum0 = _mm_packus_epi16(sum0, sum1); + sum0 = _mm_shuffle_epi8(sum0, gat); + _mm_storel_epi64((__m128i *)dst, sum0); dst += stride; + rep = _mm_add_epi16(rep, one); - d = _mm_add_epi16(d, inc); } } -void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, +void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i pixels[11]; - load_pixel_w32(above, left, 16, pixels); + __m128i pixels[2]; + load_pixel_h_w8(above, left, 4, pixels); - __m128i wh[4], ww[8]; - load_weight_w32(sm_weight_arrays, 16, wh, ww); + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 4, ww); - smooth_pred_32x8(pixels, wh, ww, dst, stride, 0); - dst += stride << 3; - smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); + smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0); } -void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, +void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i pixels[11]; - load_pixel_w32(above, left, 32, pixels); + __m128i pixels[2]; + load_pixel_h_w8(above, left, 8, pixels); - __m128i wh[8], ww[8]; - load_weight_w32(sm_weight_arrays, 32, wh, ww); + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 8, ww); - smooth_pred_32x8(pixels, &wh[0], ww, dst, stride, 0); + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); +} + +void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[2]; + load_pixel_h_w8(above, left, 16, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 16, ww); + + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0); dst += stride << 3; - smooth_pred_32x8(pixels, &wh[2], ww, dst, stride, 1); + smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1); +} + +void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m128i pixels[4]; + load_pixel_h_w8(above, left, 32, pixels); + + __m128i ww[2]; + load_weight_h_w8(sm_weight_arrays, 32, ww); + + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0); + dst += stride << 3; + smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1); dst += stride << 3; - smooth_pred_32x8(pixels, &wh[4], ww, dst, stride, 2); + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0); dst += stride << 3; - smooth_pred_32x8(pixels, &wh[6], ww, dst, stride, 3); + smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1); +} + +static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left, uint32_t bw, + uint32_t bh) { + const uint8_t *const sm_weights_w = sm_weight_arrays + bw; + const __m128i zero = _mm_setzero_si128(); + const __m128i scale_value = + _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale)); + const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]); + const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); + const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); + + for (uint32_t y = 0; y < bh; ++y) { + const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]); + const __m128i tr_ly = + _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0); + + for (uint32_t x = 0; x < bw; x += 8) { + const __m128i weights_x = + _mm_loadl_epi64((const __m128i *)(sm_weights_w + x)); + const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero); + const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw); + const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw); + const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw); + __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly); + __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly); + + pred_lo = _mm_add_epi32(pred_lo, pred_round); + pred_hi = _mm_add_epi32(pred_hi, pred_round); + + pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale); + pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale); + + __m128i pred = _mm_packus_epi16(pred_lo, pred_hi); + pred = _mm_shuffle_epi8(pred, gat); + _mm_storel_epi64((__m128i *)(dst + x), pred); + } + dst += stride; + } +} + +void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 4); +} + +void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 8); +} + +void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 16); +} + +void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 32); +} + +void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 16, 64); +} + +void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 8); +} + +void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 16); +} + +void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 32); +} + +void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 32, 64); +} + +void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 64); +} + +void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 32); +} + +void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + smooth_h_predictor_wxh(dst, stride, above, left, 64, 16); } diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm deleted file mode 100644 index bc1bb2ff3..000000000 --- a/third_party/aom/aom_dsp/x86/intrapred_ssse3_asm.asm +++ /dev/null @@ -1,410 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pb_1: times 16 db 1 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 -sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 -sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 -sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -SECTION .text - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM ssse3 -cglobal d63e_predictor_4x4, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - pshufb m1, m3, [GLOBAL(sh_b23456777)] - pshufb m2, m3, [GLOBAL(sh_b12345677)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movd [dstq ], m3 - movd [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m3, 1 - psrldq m4, 1 - movd [dstq ], m3 - movd [dstq+strideq], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; l1, l2, l3, l4 - movd m1, [aboveq-1] ; tl, t1, t2, t3 - punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 - pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 - psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 - psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 - ; comments below are for a predictor like this - ; A1 B1 C1 D1 - ; A2 B2 A1 B1 - ; A3 B3 A2 B2 - ; A4 B4 A3 B3 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 - pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 - - punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+stride3q ], m3 - psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq*2], m3 - psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq ], m3 - psrldq m3, 2 ; A1 B1 C1 D1 .. - movd [dstq ], m3 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - movq m0, [leftq] ; [0- 7] l1-8 [byte] - movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] - pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] - pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] - pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] - pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] - psrldq m4, m0, 1 ; t1-7 [word] - psrldq m5, m0, 2 ; t2-7 [word] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 - ; A2 B2 A1 B1 C1 D1 E1 F1 - ; A3 B3 A2 B2 A1 B1 C1 D1 - ; A4 B4 A3 B3 A2 B2 A1 B1 - ; A5 B5 A4 B4 A3 B3 A2 B2 - ; A6 B6 A5 B5 A4 B4 A3 B3 - ; A7 B7 A6 B6 A5 B5 A4 B4 - ; A8 B8 A7 B7 A6 B6 A5 B5 - pavgb m6, m1, m2 ; 2-tap avg A8-A1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 - - punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - - movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 - palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 - movq [dstq+strideq*2], m0 - psrldq m0, 2 ; A-B2, A-B1, C-H1 - movq [dstq+strideq ], m0 - psrldq m0, 2 ; A-H1 - movq [dstq ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 - psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 - movq [dstq+strideq*2], m6 - psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 - movq [dstq+strideq ], m6 - psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 - movq [dstq ], m6 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 - ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 - ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 - ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 - ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 - ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 - ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 - ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 - ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 - ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 - ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 - ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 - ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 - ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 - ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 - ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 - pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m6, 15 - palignr m3, m0, m6, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] - pavgb m5, m0 ; A1 - Ag - - punpcklbw m0, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... A-Bg - - pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] - pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 - - pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - palignr m2, m1, m6, 14 - mova [dstq ], m2 - palignr m2, m1, m6, 12 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 10 - mova [dstq+strideq*2], m2 - palignr m2, m1, m6, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m1, m6, 6 - mova [dstq ], m2 - palignr m2, m1, m6, 4 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 2 - mova [dstq+strideq*2], m2 - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - mova [dstq+stride3q ], m6 - lea dstq, [dstq+strideq*4] - - palignr m2, m6, m4, 14 - mova [dstq ], m2 - palignr m2, m6, m4, 12 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 10 - mova [dstq+strideq*2], m2 - palignr m2, m6, m4, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m6, m4, 6 - mova [dstq ], m2 - palignr m2, m6, m4, 4 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 2 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - movu m1, [aboveq+15] - - pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] - - palignr m3, m1, m7, 1 - palignr m5, m1, m7, 2 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] - - pshufb m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m7, 15 - palignr m3, m0, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pavgb m5, m0 ; A1 - Ag - punpcklbw m6, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... A-Bg - pshufb m6, [GLOBAL(sh_bfedcba9876543210)] - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - - DEFINE_ARGS dst, stride, stride3, left, line - lea stride3q, [strideq*3] - - palignr m5, m2, m1, 14 - palignr m7, m1, m6, 14 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 12 - palignr m7, m1, m6, 12 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 10 - palignr m7, m1, m6, 10 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - palignr m5, m2, m1, 8 - palignr m7, m1, m6, 8 - mova [dstq+stride3q ], m7 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m2, m1, 6 - palignr m7, m1, m6, 6 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 4 - palignr m7, m1, m6, 4 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 2 - palignr m7, m1, m6, 2 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m6 - mova [dstq+stride3q+16 ], m1 - lea dstq, [dstq+strideq*4] - - palignr m5, m1, m6, 14 - palignr m3, m6, m4, 14 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 12 - palignr m3, m6, m4, 12 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 10 - palignr m3, m6, m4, 10 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - palignr m5, m1, m6, 8 - palignr m3, m6, m4, 8 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m1, m6, 6 - palignr m3, m6, m4, 6 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 4 - palignr m3, m6, m4, 4 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 2 - palignr m3, m6, m4, 2 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m4 - mova [dstq+stride3q+16 ], m6 - lea dstq, [dstq+strideq*4] - - mova m7, [leftq] - mova m3, [leftq+16] - palignr m5, m3, m7, 15 - palignr m0, m3, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - - pavgb m5, m3 ; Ah - - punpcklbw m3, m2, m5 ; A-B8 ... A-B1 - punpckhbw m2, m5 ; A-B9 ... A-Bg - pshufb m3, [GLOBAL(sh_bfedcba9876543210)] - pshufb m2, [GLOBAL(sh_bfedcba9876543210)] - - palignr m7, m6, m4, 14 - palignr m0, m4, m3, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 12 - palignr m0, m4, m3, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 10 - palignr m0, m4, m3, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m6, m4, 8 - palignr m0, m4, m3, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m6, m4, 6 - palignr m0, m4, m3, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 4 - palignr m0, m4, m3, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 2 - palignr m0, m4, m3, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m4 - lea dstq, [dstq+strideq*4] - - palignr m7, m4, m3, 14 - palignr m0, m3, m2, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 12 - palignr m0, m3, m2, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 10 - palignr m0, m3, m2, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m4, m3, 8 - palignr m0, m3, m2, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m4, m3, 6 - palignr m0, m3, m2, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 4 - palignr m0, m3, m2, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 2 - palignr m0, m3, m2, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m3 - - RESTORE_GOT - RET diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c b/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c deleted file mode 100644 index a9d6a127c..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_avx2.c +++ /dev/null @@ -1,1238 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_dsp/x86/inv_txfm_common_avx2.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -void aom_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[16]; - load_buffer_16x16(input, in); - mm256_transpose_16x16(in, in); - av1_idct16_avx2(in); - mm256_transpose_16x16(in, in); - av1_idct16_avx2(in); - store_buffer_16xN(in, stride, dest, 16); -} - -static INLINE void transpose_col_to_row_nz4x4(__m256i *in /*in[4]*/) { - const __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]); - const __m256i u1 = _mm256_unpacklo_epi16(in[2], in[3]); - const __m256i v0 = _mm256_unpacklo_epi32(u0, u1); - const __m256i v1 = _mm256_unpackhi_epi32(u0, u1); - in[0] = _mm256_permute4x64_epi64(v0, 0xA8); - in[1] = _mm256_permute4x64_epi64(v0, 0xA9); - in[2] = _mm256_permute4x64_epi64(v1, 0xA8); - in[3] = _mm256_permute4x64_epi64(v1, 0xA9); -} - -#define MM256_SHUFFLE_EPI64(x0, x1, imm8) \ - _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(x0), \ - _mm256_castsi256_pd(x1), imm8)) - -static INLINE void transpose_col_to_row_nz4x16(__m256i *in /*in[16]*/) { - int i; - for (i = 0; i < 16; i += 4) { - transpose_col_to_row_nz4x4(&in[i]); - } - - for (i = 0; i < 4; ++i) { - in[i] = MM256_SHUFFLE_EPI64(in[i], in[i + 4], 0); - in[i + 8] = MM256_SHUFFLE_EPI64(in[i + 8], in[i + 12], 0); - } - - for (i = 0; i < 4; ++i) { - in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20); - } -} - -// Coefficients 0-7 before the final butterfly -static INLINE void idct16_10_first_half(const __m256i *in, __m256i *out) { - const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m256i v4 = _mm256_mulhrs_epi16(in[2], c2p28); - const __m256i v7 = _mm256_mulhrs_epi16(in[2], c2p04); - - const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i v0 = _mm256_mulhrs_epi16(in[0], c2p16); - const __m256i v1 = v0; - - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - __m256i v5, v6; - unpack_butter_fly(&v7, &v4, &cospi_p16_m16, &cospi_p16_p16, &v5, &v6); - - out[0] = _mm256_add_epi16(v0, v7); - out[1] = _mm256_add_epi16(v1, v6); - out[2] = _mm256_add_epi16(v1, v5); - out[3] = _mm256_add_epi16(v0, v4); - out[4] = _mm256_sub_epi16(v0, v4); - out[5] = _mm256_sub_epi16(v1, v5); - out[6] = _mm256_sub_epi16(v1, v6); - out[7] = _mm256_sub_epi16(v0, v7); -} - -// Coefficients 8-15 before the final butterfly -static INLINE void idct16_10_second_half(const __m256i *in, __m256i *out) { - const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30); - const __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02); - - const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - const __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26); - const __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06); - - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - __m256i t1, t2, t5, t6; - unpack_butter_fly(&t0, &t7, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); - unpack_butter_fly(&t3, &t4, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); - - out[0] = _mm256_add_epi16(t0, t3); - out[1] = _mm256_add_epi16(t1, t2); - out[6] = _mm256_add_epi16(t6, t5); - out[7] = _mm256_add_epi16(t7, t4); - - const __m256i v2 = _mm256_sub_epi16(t1, t2); - const __m256i v3 = _mm256_sub_epi16(t0, t3); - const __m256i v4 = _mm256_sub_epi16(t7, t4); - const __m256i v5 = _mm256_sub_epi16(t6, t5); - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]); - unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]); -} - -static INLINE void add_sub_butterfly(const __m256i *in, __m256i *out, - int size) { - int i = 0; - const int num = size >> 1; - const int bound = size - 1; - while (i < num) { - out[i] = _mm256_add_epi16(in[i], in[bound - i]); - out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); - i++; - } -} - -static INLINE void idct16_10(__m256i *in /*in[16]*/) { - __m256i out[16]; - idct16_10_first_half(in, out); - idct16_10_second_half(in, &out[8]); - add_sub_butterfly(out, in, 16); -} - -void aom_idct16x16_10_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[16]; - - load_coeff(input, &in[0]); - load_coeff(input + 16, &in[1]); - load_coeff(input + 32, &in[2]); - load_coeff(input + 48, &in[3]); - - transpose_col_to_row_nz4x4(in); - idct16_10(in); - - transpose_col_to_row_nz4x16(in); - idct16_10(in); - - store_buffer_16xN(in, stride, dest, 16); -} - -// Note: -// For 16x16 int16_t matrix -// transpose first 8 columns into first 8 rows. -// Since only upper-left 8x8 are non-zero, the input are first 8 rows (in[8]). -// After transposing, the 8 row vectors are in in[8]. -void transpose_col_to_row_nz8x8(__m256i *in /*in[8]*/) { - __m256i u0 = _mm256_unpacklo_epi16(in[0], in[1]); - __m256i u1 = _mm256_unpackhi_epi16(in[0], in[1]); - __m256i u2 = _mm256_unpacklo_epi16(in[2], in[3]); - __m256i u3 = _mm256_unpackhi_epi16(in[2], in[3]); - - const __m256i v0 = _mm256_unpacklo_epi32(u0, u2); - const __m256i v1 = _mm256_unpackhi_epi32(u0, u2); - const __m256i v2 = _mm256_unpacklo_epi32(u1, u3); - const __m256i v3 = _mm256_unpackhi_epi32(u1, u3); - - u0 = _mm256_unpacklo_epi16(in[4], in[5]); - u1 = _mm256_unpackhi_epi16(in[4], in[5]); - u2 = _mm256_unpacklo_epi16(in[6], in[7]); - u3 = _mm256_unpackhi_epi16(in[6], in[7]); - - const __m256i v4 = _mm256_unpacklo_epi32(u0, u2); - const __m256i v5 = _mm256_unpackhi_epi32(u0, u2); - const __m256i v6 = _mm256_unpacklo_epi32(u1, u3); - const __m256i v7 = _mm256_unpackhi_epi32(u1, u3); - - in[0] = MM256_SHUFFLE_EPI64(v0, v4, 0); - in[1] = MM256_SHUFFLE_EPI64(v0, v4, 3); - in[2] = MM256_SHUFFLE_EPI64(v1, v5, 0); - in[3] = MM256_SHUFFLE_EPI64(v1, v5, 3); - in[4] = MM256_SHUFFLE_EPI64(v2, v6, 0); - in[5] = MM256_SHUFFLE_EPI64(v2, v6, 3); - in[6] = MM256_SHUFFLE_EPI64(v3, v7, 0); - in[7] = MM256_SHUFFLE_EPI64(v3, v7, 3); -} - -// Note: -// For 16x16 int16_t matrix -// transpose first 8 columns into first 8 rows. -// Since only matrix left 8x16 are non-zero, the input are total 16 rows -// (in[16]). -// After transposing, the 8 row vectors are in in[8]. All else are zero. -static INLINE void transpose_col_to_row_nz8x16(__m256i *in /*in[16]*/) { - transpose_col_to_row_nz8x8(in); - transpose_col_to_row_nz8x8(&in[8]); - - int i; - for (i = 0; i < 8; ++i) { - in[i] = _mm256_permute2x128_si256(in[i], in[i + 8], 0x20); - } -} - -static INLINE void idct16_38_first_half(const __m256i *in, __m256i *out) { - const __m256i c2p28 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i c2p04 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - __m256i t4 = _mm256_mulhrs_epi16(in[2], c2p28); - __m256i t7 = _mm256_mulhrs_epi16(in[2], c2p04); - - const __m256i c2m20 = pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m256i c2p12 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - __m256i t5 = _mm256_mulhrs_epi16(in[6], c2m20); - __m256i t6 = _mm256_mulhrs_epi16(in[6], c2p12); - - const __m256i c2p16 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i c2p24 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m256i c2p08 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - const __m256i u0 = _mm256_mulhrs_epi16(in[0], c2p16); - const __m256i u1 = _mm256_mulhrs_epi16(in[0], c2p16); - const __m256i u2 = _mm256_mulhrs_epi16(in[4], c2p24); - const __m256i u3 = _mm256_mulhrs_epi16(in[4], c2p08); - - const __m256i u4 = _mm256_add_epi16(t4, t5); - const __m256i u5 = _mm256_sub_epi16(t4, t5); - const __m256i u6 = _mm256_sub_epi16(t7, t6); - const __m256i u7 = _mm256_add_epi16(t7, t6); - - const __m256i t0 = _mm256_add_epi16(u0, u3); - const __m256i t1 = _mm256_add_epi16(u1, u2); - const __m256i t2 = _mm256_sub_epi16(u1, u2); - const __m256i t3 = _mm256_sub_epi16(u0, u3); - - t4 = u4; - t7 = u7; - - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6); - - out[0] = _mm256_add_epi16(t0, t7); - out[1] = _mm256_add_epi16(t1, t6); - out[2] = _mm256_add_epi16(t2, t5); - out[3] = _mm256_add_epi16(t3, t4); - out[4] = _mm256_sub_epi16(t3, t4); - out[5] = _mm256_sub_epi16(t2, t5); - out[6] = _mm256_sub_epi16(t1, t6); - out[7] = _mm256_sub_epi16(t0, t7); -} - -static INLINE void idct16_38_second_half(const __m256i *in, __m256i *out) { - const __m256i c2p30 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m256i c2p02 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - __m256i t0 = _mm256_mulhrs_epi16(in[1], c2p30); - __m256i t7 = _mm256_mulhrs_epi16(in[1], c2p02); - - const __m256i c2m18 = pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m256i c2p14 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - __m256i t1 = _mm256_mulhrs_epi16(in[7], c2m18); - __m256i t6 = _mm256_mulhrs_epi16(in[7], c2p14); - - const __m256i c2p22 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m256i c2p10 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - __m256i t2 = _mm256_mulhrs_epi16(in[5], c2p22); - __m256i t5 = _mm256_mulhrs_epi16(in[5], c2p10); - - const __m256i c2m26 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i c2p06 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - __m256i t3 = _mm256_mulhrs_epi16(in[3], c2m26); - __m256i t4 = _mm256_mulhrs_epi16(in[3], c2p06); - - __m256i v0, v1, v2, v3, v4, v5, v6, v7; - v0 = _mm256_add_epi16(t0, t1); - v1 = _mm256_sub_epi16(t0, t1); - v2 = _mm256_sub_epi16(t3, t2); - v3 = _mm256_add_epi16(t2, t3); - v4 = _mm256_add_epi16(t4, t5); - v5 = _mm256_sub_epi16(t4, t5); - v6 = _mm256_sub_epi16(t7, t6); - v7 = _mm256_add_epi16(t6, t7); - - t0 = v0; - t7 = v7; - t3 = v3; - t4 = v4; - const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6); - unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5); - - v0 = _mm256_add_epi16(t0, t3); - v1 = _mm256_add_epi16(t1, t2); - v2 = _mm256_sub_epi16(t1, t2); - v3 = _mm256_sub_epi16(t0, t3); - v4 = _mm256_sub_epi16(t7, t4); - v5 = _mm256_sub_epi16(t6, t5); - v6 = _mm256_add_epi16(t6, t5); - v7 = _mm256_add_epi16(t7, t4); - - // stage 6, (8-15) - out[0] = v0; - out[1] = v1; - out[6] = v6; - out[7] = v7; - const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); - const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &out[2], &out[5]); - unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &out[3], &out[4]); -} - -static INLINE void idct16_38(__m256i *in /*in[16]*/) { - __m256i out[16]; - idct16_38_first_half(in, out); - idct16_38_second_half(in, &out[8]); - add_sub_butterfly(out, in, 16); -} - -void aom_idct16x16_38_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[16]; - - int i; - for (i = 0; i < 8; ++i) { - load_coeff(input + (i << 4), &in[i]); - } - - transpose_col_to_row_nz8x8(in); - idct16_38(in); - - transpose_col_to_row_nz8x16(in); - idct16_38(in); - - store_buffer_16xN(in, stride, dest, 16); -} - -static INLINE int calculate_dc(const tran_low_t *input) { - int dc = (int)dct_const_round_shift(input[0] * cospi_16_64); - dc = (int)dct_const_round_shift(dc * cospi_16_64); - dc = ROUND_POWER_OF_TWO(dc, IDCT_ROUNDING_POS); - return dc; -} - -void aom_idct16x16_1_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - const int dc = calculate_dc(input); - if (dc == 0) return; - - const __m256i dc_value = _mm256_set1_epi16(dc); - - int i; - for (i = 0; i < 16; ++i) { - recon_and_store(&dc_value, dest); - dest += stride; - } -} - -// ----------------------------------------------------------------------------- -// 32x32 partial IDCT - -void aom_idct32x32_1_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - const int dc = calculate_dc(input); - if (dc == 0) return; - - const __m256i dc_value = _mm256_set1_epi16(dc); - - int i; - for (i = 0; i < 32; ++i) { - recon_and_store(&dc_value, dest); - recon_and_store(&dc_value, dest + 16); - dest += stride; - } -} - -static void load_buffer_32x16(const tran_low_t *input, __m256i *in /*in[32]*/) { - int i; - for (i = 0; i < 16; ++i) { - load_coeff(input, &in[i]); - load_coeff(input + 16, &in[i + 16]); - input += 32; - } -} - -// Note: -// We extend SSSE3 operations to AVX2. Instead of operating on __m128i, we -// operate coefficients on __m256i. Our operation capacity doubles for each -// instruction. -#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ - do { \ - tmp0 = _mm256_madd_epi16(x0, co0); \ - tmp1 = _mm256_madd_epi16(x1, co0); \ - tmp2 = _mm256_madd_epi16(x0, co1); \ - tmp3 = _mm256_madd_epi16(x1, co1); \ - tmp0 = _mm256_add_epi32(tmp0, rounding); \ - tmp1 = _mm256_add_epi32(tmp1, rounding); \ - tmp2 = _mm256_add_epi32(tmp2, rounding); \ - tmp3 = _mm256_add_epi32(tmp3, rounding); \ - tmp0 = _mm256_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm256_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm256_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm256_srai_epi32(tmp3, DCT_CONST_BITS); \ - } while (0) - -static INLINE void butterfly(const __m256i *x0, const __m256i *x1, - const __m256i *c0, const __m256i *c1, __m256i *y0, - __m256i *y1) { - __m256i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm256_unpacklo_epi16(*x0, *x1); - u1 = _mm256_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *y0 = _mm256_packs_epi32(tmp0, tmp1); - *y1 = _mm256_packs_epi32(tmp2, tmp3); -} - -static INLINE void butterfly_self(__m256i *x0, __m256i *x1, const __m256i *c0, - const __m256i *c1) { - __m256i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m256i rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm256_unpacklo_epi16(*x0, *x1); - u1 = _mm256_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *x0 = _mm256_packs_epi32(tmp0, tmp1); - *x1 = _mm256_packs_epi32(tmp2, tmp3); -} - -// For each 16x32 block __m256i in[32], -// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 -// output pixels: 8-15 in __m256i in[32] -static void idct32_full_16x32_quarter_2(const __m256i *in /*in[32]*/, - __m256i *out /*out[16]*/) { - __m256i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ - __m256i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ - - { - const __m256i stg2_0 = pair256_set_epi16(cospi_30_64, -cospi_2_64); - const __m256i stg2_1 = pair256_set_epi16(cospi_2_64, cospi_30_64); - const __m256i stg2_2 = pair256_set_epi16(cospi_14_64, -cospi_18_64); - const __m256i stg2_3 = pair256_set_epi16(cospi_18_64, cospi_14_64); - butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); - butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); - } - - v8 = _mm256_add_epi16(u8, u9); - v9 = _mm256_sub_epi16(u8, u9); - v14 = _mm256_sub_epi16(u15, u14); - v15 = _mm256_add_epi16(u15, u14); - - { - const __m256i stg2_4 = pair256_set_epi16(cospi_22_64, -cospi_10_64); - const __m256i stg2_5 = pair256_set_epi16(cospi_10_64, cospi_22_64); - const __m256i stg2_6 = pair256_set_epi16(cospi_6_64, -cospi_26_64); - const __m256i stg2_7 = pair256_set_epi16(cospi_26_64, cospi_6_64); - butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); - butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); - } - - v10 = _mm256_sub_epi16(u11, u10); - v11 = _mm256_add_epi16(u11, u10); - v12 = _mm256_add_epi16(u12, u13); - v13 = _mm256_sub_epi16(u12, u13); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(v8, v11); - out[1] = _mm256_add_epi16(v9, v10); - out[6] = _mm256_add_epi16(v14, v13); - out[7] = _mm256_add_epi16(v15, v12); - - out[2] = _mm256_sub_epi16(v9, v10); - out[3] = _mm256_sub_epi16(v8, v11); - out[4] = _mm256_sub_epi16(v15, v12); - out[5] = _mm256_sub_epi16(v14, v13); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// For each 8x32 block __m256i in[32], -// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 -// output pixels: 0-7 in __m256i in[32] -static void idct32_full_16x32_quarter_1(const __m256i *in /*in[32]*/, - __m256i *out /*out[8]*/) { - __m256i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ - __m256i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ - - { - const __m256i stg3_0 = pair256_set_epi16(cospi_28_64, -cospi_4_64); - const __m256i stg3_1 = pair256_set_epi16(cospi_4_64, cospi_28_64); - const __m256i stg3_2 = pair256_set_epi16(cospi_12_64, -cospi_20_64); - const __m256i stg3_3 = pair256_set_epi16(cospi_20_64, cospi_12_64); - butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); - butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); - } - - v4 = _mm256_add_epi16(u4, u5); - v5 = _mm256_sub_epi16(u4, u5); - v6 = _mm256_sub_epi16(u7, u6); - v7 = _mm256_add_epi16(u7, u6); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i stg4_2 = pair256_set_epi16(cospi_24_64, -cospi_8_64); - const __m256i stg4_3 = pair256_set_epi16(cospi_8_64, cospi_24_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - - butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); - butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); - } - - v0 = _mm256_add_epi16(u0, u3); - v1 = _mm256_add_epi16(u1, u2); - v2 = _mm256_sub_epi16(u1, u2); - v3 = _mm256_sub_epi16(u0, u3); - - out[0] = _mm256_add_epi16(v0, v7); - out[1] = _mm256_add_epi16(v1, v6); - out[2] = _mm256_add_epi16(v2, v5); - out[3] = _mm256_add_epi16(v3, v4); - out[4] = _mm256_sub_epi16(v3, v4); - out[5] = _mm256_sub_epi16(v2, v5); - out[6] = _mm256_sub_epi16(v1, v6); - out[7] = _mm256_sub_epi16(v0, v7); -} - -// For each 8x32 block __m256i in[32], -// Input with odd index, -// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -// output pixels: 16-23, 24-31 in __m256i in[32] -// We avoid hide an offset, 16, inside this function. So we output 0-15 into -// array out[16] -static void idct32_full_16x32_quarter_3_4(const __m256i *in /*in[32]*/, - __m256i *out /*out[16]*/) { - __m256i v16, v17, v18, v19, v20, v21, v22, v23; - __m256i v24, v25, v26, v27, v28, v29, v30, v31; - __m256i u16, u17, u18, u19, u20, u21, u22, u23; - __m256i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m256i stg1_0 = pair256_set_epi16(cospi_31_64, -cospi_1_64); - const __m256i stg1_1 = pair256_set_epi16(cospi_1_64, cospi_31_64); - const __m256i stg1_2 = pair256_set_epi16(cospi_15_64, -cospi_17_64); - const __m256i stg1_3 = pair256_set_epi16(cospi_17_64, cospi_15_64); - const __m256i stg1_4 = pair256_set_epi16(cospi_23_64, -cospi_9_64); - const __m256i stg1_5 = pair256_set_epi16(cospi_9_64, cospi_23_64); - const __m256i stg1_6 = pair256_set_epi16(cospi_7_64, -cospi_25_64); - const __m256i stg1_7 = pair256_set_epi16(cospi_25_64, cospi_7_64); - const __m256i stg1_8 = pair256_set_epi16(cospi_27_64, -cospi_5_64); - const __m256i stg1_9 = pair256_set_epi16(cospi_5_64, cospi_27_64); - const __m256i stg1_10 = pair256_set_epi16(cospi_11_64, -cospi_21_64); - const __m256i stg1_11 = pair256_set_epi16(cospi_21_64, cospi_11_64); - const __m256i stg1_12 = pair256_set_epi16(cospi_19_64, -cospi_13_64); - const __m256i stg1_13 = pair256_set_epi16(cospi_13_64, cospi_19_64); - const __m256i stg1_14 = pair256_set_epi16(cospi_3_64, -cospi_29_64); - const __m256i stg1_15 = pair256_set_epi16(cospi_29_64, cospi_3_64); - butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); - butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); - butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); - butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); - - butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); - butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); - - butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); - butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); - } - - v16 = _mm256_add_epi16(u16, u17); - v17 = _mm256_sub_epi16(u16, u17); - v18 = _mm256_sub_epi16(u19, u18); - v19 = _mm256_add_epi16(u19, u18); - - v20 = _mm256_add_epi16(u20, u21); - v21 = _mm256_sub_epi16(u20, u21); - v22 = _mm256_sub_epi16(u23, u22); - v23 = _mm256_add_epi16(u23, u22); - - v24 = _mm256_add_epi16(u24, u25); - v25 = _mm256_sub_epi16(u24, u25); - v26 = _mm256_sub_epi16(u27, u26); - v27 = _mm256_add_epi16(u27, u26); - - v28 = _mm256_add_epi16(u28, u29); - v29 = _mm256_sub_epi16(u28, u29); - v30 = _mm256_sub_epi16(u31, u30); - v31 = _mm256_add_epi16(u31, u30); - - { - const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm256_add_epi16(v16, v19); - u17 = _mm256_add_epi16(v17, v18); - u18 = _mm256_sub_epi16(v17, v18); - u19 = _mm256_sub_epi16(v16, v19); - u20 = _mm256_sub_epi16(v23, v20); - u21 = _mm256_sub_epi16(v22, v21); - u22 = _mm256_add_epi16(v22, v21); - u23 = _mm256_add_epi16(v23, v20); - - u24 = _mm256_add_epi16(v24, v27); - u25 = _mm256_add_epi16(v25, v26); - u26 = _mm256_sub_epi16(v25, v26); - u27 = _mm256_sub_epi16(v24, v27); - - u28 = _mm256_sub_epi16(v31, v28); - u29 = _mm256_sub_epi16(v30, v29); - u30 = _mm256_add_epi16(v29, v30); - u31 = _mm256_add_epi16(v28, v31); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(u16, u23); - out[1] = _mm256_add_epi16(u17, u22); - out[2] = _mm256_add_epi16(u18, u21); - out[3] = _mm256_add_epi16(u19, u20); - out[4] = _mm256_sub_epi16(u19, u20); - out[5] = _mm256_sub_epi16(u18, u21); - out[6] = _mm256_sub_epi16(u17, u22); - out[7] = _mm256_sub_epi16(u16, u23); - - out[8] = _mm256_sub_epi16(u31, u24); - out[9] = _mm256_sub_epi16(u30, u25); - out[10] = _mm256_sub_epi16(u29, u26); - out[11] = _mm256_sub_epi16(u28, u27); - out[12] = _mm256_add_epi16(u27, u28); - out[13] = _mm256_add_epi16(u26, u29); - out[14] = _mm256_add_epi16(u25, u30); - out[15] = _mm256_add_epi16(u24, u31); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); - butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); - butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); - butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); - } -} - -static void idct32_full_16x32_quarter_1_2(const __m256i *in /*in[32]*/, - __m256i *out /*out[32]*/) { - __m256i temp[16]; - idct32_full_16x32_quarter_1(in, temp); - idct32_full_16x32_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -static void idct32_16x32(const __m256i *in /*in[32]*/, - __m256i *out /*out[32]*/) { - __m256i temp[32]; - idct32_full_16x32_quarter_1_2(in, temp); - idct32_full_16x32_quarter_3_4(in, &temp[16]); - add_sub_butterfly(temp, out, 32); -} - -void aom_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i col[64], in[32]; - int i; - - for (i = 0; i < 2; ++i) { - load_buffer_32x16(input, in); - input += 32 << 4; - - mm256_transpose_16x16(in, in); - mm256_transpose_16x16(&in[16], &in[16]); - idct32_16x32(in, col + (i << 5)); - } - - for (i = 0; i < 2; ++i) { - int j = i << 4; - mm256_transpose_16x16(col + j, in); - mm256_transpose_16x16(col + j + 32, &in[16]); - idct32_16x32(in, in); - store_buffer_16xN(in, stride, dest, 32); - dest += 16; - } -} - -// Group the coefficient calculation into smaller functions -// to prevent stack spillover: -// quarter_1: 0-7 -// quarter_2: 8-15 -// quarter_3_4: 16-23, 24-31 -static void idct32_16x32_135_quarter_1(const __m256i *in /*in[16]*/, - __m256i *out /*out[8]*/) { - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i v0, v1, v2, v3, v4, v5, v6, v7; - - { - const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i stk4_2 = pair256_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m256i stk4_3 = pair256_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - u0 = _mm256_mulhrs_epi16(in[0], stk4_0); - u2 = _mm256_mulhrs_epi16(in[8], stk4_2); - u3 = _mm256_mulhrs_epi16(in[8], stk4_3); - u1 = u0; - } - - v0 = _mm256_add_epi16(u0, u3); - v1 = _mm256_add_epi16(u1, u2); - v2 = _mm256_sub_epi16(u1, u2); - v3 = _mm256_sub_epi16(u0, u3); - - { - const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m256i stk3_2 = - pair256_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m256i stk3_3 = pair256_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - u4 = _mm256_mulhrs_epi16(in[4], stk3_0); - u7 = _mm256_mulhrs_epi16(in[4], stk3_1); - u5 = _mm256_mulhrs_epi16(in[12], stk3_2); - u6 = _mm256_mulhrs_epi16(in[12], stk3_3); - } - - v4 = _mm256_add_epi16(u4, u5); - v5 = _mm256_sub_epi16(u4, u5); - v6 = _mm256_sub_epi16(u7, u6); - v7 = _mm256_add_epi16(u7, u6); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - } - - out[0] = _mm256_add_epi16(v0, v7); - out[1] = _mm256_add_epi16(v1, v6); - out[2] = _mm256_add_epi16(v2, v5); - out[3] = _mm256_add_epi16(v3, v4); - out[4] = _mm256_sub_epi16(v3, v4); - out[5] = _mm256_sub_epi16(v2, v5); - out[6] = _mm256_sub_epi16(v1, v6); - out[7] = _mm256_sub_epi16(v0, v7); -} - -static void idct32_16x32_135_quarter_2(const __m256i *in /*in[16]*/, - __m256i *out /*out[8]*/) { - __m256i u8, u9, u10, u11, u12, u13, u14, u15; - __m256i v8, v9, v10, v11, v12, v13, v14, v15; - - { - const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m256i stk2_2 = - pair256_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m256i stk2_3 = pair256_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - const __m256i stk2_4 = pair256_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m256i stk2_5 = pair256_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - const __m256i stk2_6 = - pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - u8 = _mm256_mulhrs_epi16(in[2], stk2_0); - u15 = _mm256_mulhrs_epi16(in[2], stk2_1); - u9 = _mm256_mulhrs_epi16(in[14], stk2_2); - u14 = _mm256_mulhrs_epi16(in[14], stk2_3); - u10 = _mm256_mulhrs_epi16(in[10], stk2_4); - u13 = _mm256_mulhrs_epi16(in[10], stk2_5); - u11 = _mm256_mulhrs_epi16(in[6], stk2_6); - u12 = _mm256_mulhrs_epi16(in[6], stk2_7); - } - - v8 = _mm256_add_epi16(u8, u9); - v9 = _mm256_sub_epi16(u8, u9); - v10 = _mm256_sub_epi16(u11, u10); - v11 = _mm256_add_epi16(u11, u10); - v12 = _mm256_add_epi16(u12, u13); - v13 = _mm256_sub_epi16(u12, u13); - v14 = _mm256_sub_epi16(u15, u14); - v15 = _mm256_add_epi16(u15, u14); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(v8, v11); - out[1] = _mm256_add_epi16(v9, v10); - out[2] = _mm256_sub_epi16(v9, v10); - out[3] = _mm256_sub_epi16(v8, v11); - out[4] = _mm256_sub_epi16(v15, v12); - out[5] = _mm256_sub_epi16(v14, v13); - out[6] = _mm256_add_epi16(v14, v13); - out[7] = _mm256_add_epi16(v15, v12); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// 8x32 block even indexed 8 inputs of in[16], -// output first half 16 to out[32] -static void idct32_16x32_quarter_1_2(const __m256i *in /*in[16]*/, - __m256i *out /*out[32]*/) { - __m256i temp[16]; - idct32_16x32_135_quarter_1(in, temp); - idct32_16x32_135_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -// 8x32 block odd indexed 8 inputs of in[16], -// output second half 16 to out[32] -static void idct32_16x32_quarter_3_4(const __m256i *in /*in[16]*/, - __m256i *out /*out[32]*/) { - __m256i v16, v17, v18, v19, v20, v21, v22, v23; - __m256i v24, v25, v26, v27, v28, v29, v30, v31; - __m256i u16, u17, u18, u19, u20, u21, u22, u23; - __m256i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m256i stk1_2 = - pair256_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); - const __m256i stk1_3 = pair256_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); - - const __m256i stk1_4 = pair256_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); - const __m256i stk1_5 = pair256_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); - const __m256i stk1_6 = - pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m256i stk1_10 = - pair256_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); - const __m256i stk1_11 = pair256_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); - - const __m256i stk1_12 = pair256_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); - const __m256i stk1_13 = pair256_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); - const __m256i stk1_14 = - pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - u16 = _mm256_mulhrs_epi16(in[1], stk1_0); - u31 = _mm256_mulhrs_epi16(in[1], stk1_1); - u17 = _mm256_mulhrs_epi16(in[15], stk1_2); - u30 = _mm256_mulhrs_epi16(in[15], stk1_3); - - u18 = _mm256_mulhrs_epi16(in[9], stk1_4); - u29 = _mm256_mulhrs_epi16(in[9], stk1_5); - u19 = _mm256_mulhrs_epi16(in[7], stk1_6); - u28 = _mm256_mulhrs_epi16(in[7], stk1_7); - - u20 = _mm256_mulhrs_epi16(in[5], stk1_8); - u27 = _mm256_mulhrs_epi16(in[5], stk1_9); - u21 = _mm256_mulhrs_epi16(in[11], stk1_10); - u26 = _mm256_mulhrs_epi16(in[11], stk1_11); - - u22 = _mm256_mulhrs_epi16(in[13], stk1_12); - u25 = _mm256_mulhrs_epi16(in[13], stk1_13); - u23 = _mm256_mulhrs_epi16(in[3], stk1_14); - u24 = _mm256_mulhrs_epi16(in[3], stk1_15); - } - - v16 = _mm256_add_epi16(u16, u17); - v17 = _mm256_sub_epi16(u16, u17); - v18 = _mm256_sub_epi16(u19, u18); - v19 = _mm256_add_epi16(u19, u18); - - v20 = _mm256_add_epi16(u20, u21); - v21 = _mm256_sub_epi16(u20, u21); - v22 = _mm256_sub_epi16(u23, u22); - v23 = _mm256_add_epi16(u23, u22); - - v24 = _mm256_add_epi16(u24, u25); - v25 = _mm256_sub_epi16(u24, u25); - v26 = _mm256_sub_epi16(u27, u26); - v27 = _mm256_add_epi16(u27, u26); - - v28 = _mm256_add_epi16(u28, u29); - v29 = _mm256_sub_epi16(u28, u29); - v30 = _mm256_sub_epi16(u31, u30); - v31 = _mm256_add_epi16(u31, u30); - - { - const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm256_add_epi16(v16, v19); - u17 = _mm256_add_epi16(v17, v18); - u18 = _mm256_sub_epi16(v17, v18); - u19 = _mm256_sub_epi16(v16, v19); - u20 = _mm256_sub_epi16(v23, v20); - u21 = _mm256_sub_epi16(v22, v21); - u22 = _mm256_add_epi16(v22, v21); - u23 = _mm256_add_epi16(v23, v20); - - u24 = _mm256_add_epi16(v24, v27); - u25 = _mm256_add_epi16(v25, v26); - u26 = _mm256_sub_epi16(v25, v26); - u27 = _mm256_sub_epi16(v24, v27); - u28 = _mm256_sub_epi16(v31, v28); - u29 = _mm256_sub_epi16(v30, v29); - u30 = _mm256_add_epi16(v29, v30); - u31 = _mm256_add_epi16(v28, v31); - - { - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm256_add_epi16(u16, u23); - out[1] = _mm256_add_epi16(u17, u22); - out[2] = _mm256_add_epi16(u18, u21); - out[3] = _mm256_add_epi16(u19, u20); - v20 = _mm256_sub_epi16(u19, u20); - v21 = _mm256_sub_epi16(u18, u21); - v22 = _mm256_sub_epi16(u17, u22); - v23 = _mm256_sub_epi16(u16, u23); - - v24 = _mm256_sub_epi16(u31, u24); - v25 = _mm256_sub_epi16(u30, u25); - v26 = _mm256_sub_epi16(u29, u26); - v27 = _mm256_sub_epi16(u28, u27); - out[12] = _mm256_add_epi16(u27, u28); - out[13] = _mm256_add_epi16(u26, u29); - out[14] = _mm256_add_epi16(u25, u30); - out[15] = _mm256_add_epi16(u24, u31); - - { - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); - butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]); - butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); - butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); - } -} - -// 16x16 block input __m256i in[32], output 16x32 __m256i in[32] -static void idct32_16x32_135(__m256i *in /*in[32]*/) { - __m256i out[32]; - idct32_16x32_quarter_1_2(in, out); - idct32_16x32_quarter_3_4(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -static INLINE void load_buffer_from_32x32(const tran_low_t *coeff, __m256i *in, - int size) { - int i = 0; - while (i < size) { - load_coeff(coeff + (i << 5), &in[i]); - i += 1; - } -} - -static INLINE void zero_buffer(__m256i *in, int num) { - int i; - for (i = 0; i < num; ++i) { - in[i] = _mm256_setzero_si256(); - } -} - -// Only upper-left 16x16 has non-zero coeff -void aom_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[32]; - zero_buffer(in, 32); - load_buffer_from_32x32(input, in, 16); - mm256_transpose_16x16(in, in); - idct32_16x32_135(in); - - __m256i out[32]; - mm256_transpose_16x16(in, out); - idct32_16x32_135(out); - store_buffer_16xN(out, stride, dest, 32); - mm256_transpose_16x16(&in[16], in); - idct32_16x32_135(in); - store_buffer_16xN(in, stride, dest + 16, 32); -} - -static void idct32_34_first_half(const __m256i *in, __m256i *stp1) { - const __m256i stk2_0 = pair256_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m256i stk2_1 = pair256_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m256i stk2_6 = pair256_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m256i stk2_7 = pair256_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - - const __m256i stk3_0 = pair256_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m256i stk3_1 = pair256_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stk4_0 = pair256_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m256i stg4_1 = pair256_set_epi16(cospi_16_64, -cospi_16_64); - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - __m256i u0, u1, u2, u3, u4, u5, u6, u7; - __m256i x0, x1, x4, x5, x6, x7; - __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - - // phase 1 - - // 0, 15 - u2 = _mm256_mulhrs_epi16(in[2], stk2_1); // stp2_15 - u3 = _mm256_mulhrs_epi16(in[6], stk2_7); // stp2_12 - v15 = _mm256_add_epi16(u2, u3); - // in[0], in[4] - x0 = _mm256_mulhrs_epi16(in[0], stk4_0); // stp1[0] - x7 = _mm256_mulhrs_epi16(in[4], stk3_1); // stp1[7] - v0 = _mm256_add_epi16(x0, x7); // stp2_0 - stp1[0] = _mm256_add_epi16(v0, v15); - stp1[15] = _mm256_sub_epi16(v0, v15); - - // in[2], in[6] - u0 = _mm256_mulhrs_epi16(in[2], stk2_0); // stp2_8 - u1 = _mm256_mulhrs_epi16(in[6], stk2_6); // stp2_11 - butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 - butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 - - v8 = _mm256_add_epi16(u0, u1); - v9 = _mm256_add_epi16(u4, u6); - v10 = _mm256_sub_epi16(u4, u6); - v11 = _mm256_sub_epi16(u0, u1); - v12 = _mm256_sub_epi16(u2, u3); - v13 = _mm256_sub_epi16(u5, u7); - v14 = _mm256_add_epi16(u5, u7); - - butterfly_self(&v10, &v13, &stg6_0, &stg4_0); - butterfly_self(&v11, &v12, &stg6_0, &stg4_0); - - // 1, 14 - x1 = _mm256_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 - // stp1[2] = stp1[0], stp1[3] = stp1[1] - x4 = _mm256_mulhrs_epi16(in[4], stk3_0); // stp1[4] - butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); - v1 = _mm256_add_epi16(x1, x6); // stp2_1 - v2 = _mm256_add_epi16(x0, x5); // stp2_2 - stp1[1] = _mm256_add_epi16(v1, v14); - stp1[14] = _mm256_sub_epi16(v1, v14); - - stp1[2] = _mm256_add_epi16(v2, v13); - stp1[13] = _mm256_sub_epi16(v2, v13); - - v3 = _mm256_add_epi16(x1, x4); // stp2_3 - v4 = _mm256_sub_epi16(x1, x4); // stp2_4 - - v5 = _mm256_sub_epi16(x0, x5); // stp2_5 - - v6 = _mm256_sub_epi16(x1, x6); // stp2_6 - v7 = _mm256_sub_epi16(x0, x7); // stp2_7 - stp1[3] = _mm256_add_epi16(v3, v12); - stp1[12] = _mm256_sub_epi16(v3, v12); - - stp1[6] = _mm256_add_epi16(v6, v9); - stp1[9] = _mm256_sub_epi16(v6, v9); - - stp1[7] = _mm256_add_epi16(v7, v8); - stp1[8] = _mm256_sub_epi16(v7, v8); - - stp1[4] = _mm256_add_epi16(v4, v11); - stp1[11] = _mm256_sub_epi16(v4, v11); - - stp1[5] = _mm256_add_epi16(v5, v10); - stp1[10] = _mm256_sub_epi16(v5, v10); -} - -static void idct32_34_second_half(const __m256i *in, __m256i *stp1) { - const __m256i stk1_0 = pair256_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m256i stk1_1 = pair256_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m256i stk1_6 = pair256_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m256i stk1_7 = pair256_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m256i stk1_8 = pair256_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m256i stk1_9 = pair256_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m256i stk1_14 = pair256_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m256i stk1_15 = pair256_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - const __m256i stg3_4 = pair256_set_epi16(-cospi_4_64, cospi_28_64); - const __m256i stg3_5 = pair256_set_epi16(cospi_28_64, cospi_4_64); - const __m256i stg3_6 = pair256_set_epi16(-cospi_28_64, -cospi_4_64); - const __m256i stg3_8 = pair256_set_epi16(-cospi_20_64, cospi_12_64); - const __m256i stg3_9 = pair256_set_epi16(cospi_12_64, cospi_20_64); - const __m256i stg3_10 = pair256_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m256i stg4_0 = pair256_set_epi16(cospi_16_64, cospi_16_64); - const __m256i stg4_4 = pair256_set_epi16(-cospi_8_64, cospi_24_64); - const __m256i stg4_5 = pair256_set_epi16(cospi_24_64, cospi_8_64); - const __m256i stg4_6 = pair256_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m256i stg6_0 = pair256_set_epi16(-cospi_16_64, cospi_16_64); - __m256i v16, v17, v18, v19, v20, v21, v22, v23; - __m256i v24, v25, v26, v27, v28, v29, v30, v31; - __m256i u16, u17, u18, u19, u20, u21, u22, u23; - __m256i u24, u25, u26, u27, u28, u29, u30, u31; - - v16 = _mm256_mulhrs_epi16(in[1], stk1_0); - v31 = _mm256_mulhrs_epi16(in[1], stk1_1); - - v19 = _mm256_mulhrs_epi16(in[7], stk1_6); - v28 = _mm256_mulhrs_epi16(in[7], stk1_7); - - v20 = _mm256_mulhrs_epi16(in[5], stk1_8); - v27 = _mm256_mulhrs_epi16(in[5], stk1_9); - - v23 = _mm256_mulhrs_epi16(in[3], stk1_14); - v24 = _mm256_mulhrs_epi16(in[3], stk1_15); - - butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30); - butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); - butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); - butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); - - u16 = _mm256_add_epi16(v16, v19); - u17 = _mm256_add_epi16(v17, v18); - u18 = _mm256_sub_epi16(v17, v18); - u19 = _mm256_sub_epi16(v16, v19); - u20 = _mm256_sub_epi16(v23, v20); - u21 = _mm256_sub_epi16(v22, v21); - u22 = _mm256_add_epi16(v22, v21); - u23 = _mm256_add_epi16(v23, v20); - u24 = _mm256_add_epi16(v24, v27); - u27 = _mm256_sub_epi16(v24, v27); - u25 = _mm256_add_epi16(v25, v26); - u26 = _mm256_sub_epi16(v25, v26); - u28 = _mm256_sub_epi16(v31, v28); - u31 = _mm256_add_epi16(v28, v31); - u29 = _mm256_sub_epi16(v30, v29); - u30 = _mm256_add_epi16(v29, v30); - - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - - stp1[0] = _mm256_add_epi16(u16, u23); - stp1[7] = _mm256_sub_epi16(u16, u23); - - stp1[1] = _mm256_add_epi16(u17, u22); - stp1[6] = _mm256_sub_epi16(u17, u22); - - stp1[2] = _mm256_add_epi16(u18, u21); - stp1[5] = _mm256_sub_epi16(u18, u21); - - stp1[3] = _mm256_add_epi16(u19, u20); - stp1[4] = _mm256_sub_epi16(u19, u20); - - stp1[8] = _mm256_sub_epi16(u31, u24); - stp1[15] = _mm256_add_epi16(u24, u31); - - stp1[9] = _mm256_sub_epi16(u30, u25); - stp1[14] = _mm256_add_epi16(u25, u30); - - stp1[10] = _mm256_sub_epi16(u29, u26); - stp1[13] = _mm256_add_epi16(u26, u29); - - stp1[11] = _mm256_sub_epi16(u28, u27); - stp1[12] = _mm256_add_epi16(u27, u28); - - butterfly_self(&stp1[4], &stp1[11], &stg6_0, &stg4_0); - butterfly_self(&stp1[5], &stp1[10], &stg6_0, &stg4_0); - butterfly_self(&stp1[6], &stp1[9], &stg6_0, &stg4_0); - butterfly_self(&stp1[7], &stp1[8], &stg6_0, &stg4_0); -} - -// 16x16 block input __m256i in[32], output 16x32 __m256i in[32] -static void idct32_16x32_34(__m256i *in /*in[32]*/) { - __m256i out[32]; - idct32_34_first_half(in, out); - idct32_34_second_half(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -// Only upper-left 8x8 has non-zero coeff -void aom_idct32x32_34_add_avx2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m256i in[32]; - zero_buffer(in, 32); - load_buffer_from_32x32(input, in, 8); - mm256_transpose_16x16(in, in); - idct32_16x32_34(in); - - __m256i out[32]; - mm256_transpose_16x16(in, out); - idct32_16x32_34(out); - store_buffer_16xN(out, stride, dest, 32); - mm256_transpose_16x16(&in[16], in); - idct32_16x32_34(in); - store_buffer_16xN(in, stride, dest + 16, 32); -} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h deleted file mode 100644 index 26c5cfe59..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_common_avx2.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2017, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H -#define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H - -#include - -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/txfm_common_avx2.h" - -static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { - if (sizeof(tran_low_t) == 4) { - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); - } else { - *in = _mm256_loadu_si256((const __m256i *)coeff); - } -} - -static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) { - int i = 0; - while (i < 16) { - load_coeff(coeff + (i << 4), &in[i]); - i += 1; - } -} - -static INLINE void recon_and_store(const __m256i *res, uint8_t *output) { - const __m128i zero = _mm_setzero_si128(); - __m128i x = _mm_loadu_si128((__m128i const *)output); - __m128i p0 = _mm_unpacklo_epi8(x, zero); - __m128i p1 = _mm_unpackhi_epi8(x, zero); - - p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res)); - p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1)); - x = _mm_packus_epi16(p0, p1); - _mm_storeu_si128((__m128i *)output, x); -} - -#define IDCT_ROUNDING_POS (6) -static INLINE void store_buffer_16xN(__m256i *in, const int stride, - uint8_t *output, int num) { - const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1)); - int i = 0; - - while (i < num) { - in[i] = _mm256_adds_epi16(in[i], rounding); - in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS); - recon_and_store(&in[i], output + i * stride); - i += 1; - } -} - -static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1, - const __m256i *c0, const __m256i *c1, - __m256i *b0, __m256i *b1) { - __m256i x0, x1; - x0 = _mm256_unpacklo_epi16(*a0, *a1); - x1 = _mm256_unpackhi_epi16(*a0, *a1); - *b0 = butter_fly(&x0, &x1, c0); - *b1 = butter_fly(&x0, &x1, c1); -} - -void av1_idct16_avx2(__m256i *in); - -#endif // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c deleted file mode 100644 index 86ce928b7..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.c +++ /dev/null @@ -1,3500 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/x86/inv_txfm_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -#define RECON_AND_STORE4X4(dest, in_x) \ - { \ - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - *(int *)(dest) = _mm_cvtsi128_si32(d0); \ - } - -void aom_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i eight = _mm_set1_epi16(8); - const __m128i cst = _mm_setr_epi16( - (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64, - (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64, - (int16_t)cospi_8_64, (int16_t)cospi_24_64); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i input0, input1, input2, input3; - - // Rows - input0 = load_input_data(input); - input2 = load_input_data(input + 8); - - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_shufflelo_epi16(input0, 0xd8); - input0 = _mm_shufflehi_epi16(input0, 0xd8); - input2 = _mm_shufflelo_epi16(input2, 0xd8); - input2 = _mm_shufflehi_epi16(input2, 0xd8); - - input1 = _mm_unpackhi_epi32(input0, input0); - input0 = _mm_unpacklo_epi32(input0, input0); - input3 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpacklo_epi32(input2, input2); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input1); - input1 = _mm_packs_epi32(input2, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Columns - // Construct i3, i1, i3, i1, i2, i0, i2, i0 - input0 = _mm_unpacklo_epi32(input2, input2); - input1 = _mm_unpackhi_epi32(input2, input2); - input2 = _mm_unpackhi_epi32(input3, input3); - input3 = _mm_unpacklo_epi32(input3, input3); - - // Stage 1 - input0 = _mm_madd_epi16(input0, cst); - input1 = _mm_madd_epi16(input1, cst); - input2 = _mm_madd_epi16(input2, cst); - input3 = _mm_madd_epi16(input3, cst); - - input0 = _mm_add_epi32(input0, rounding); - input1 = _mm_add_epi32(input1, rounding); - input2 = _mm_add_epi32(input2, rounding); - input3 = _mm_add_epi32(input3, rounding); - - input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); - input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); - input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); - input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); - - // Stage 2 - input0 = _mm_packs_epi32(input0, input2); - input1 = _mm_packs_epi32(input1, input3); - - // Transpose - input2 = _mm_unpacklo_epi16(input0, input1); - input3 = _mm_unpackhi_epi16(input0, input1); - input0 = _mm_unpacklo_epi32(input2, input3); - input1 = _mm_unpackhi_epi32(input2, input3); - - // Switch column2, column 3, and then, we got: - // input2: column1, column 0; input3: column2, column 3. - input1 = _mm_shuffle_epi32(input1, 0x4e); - input2 = _mm_add_epi16(input0, input1); - input3 = _mm_sub_epi16(input0, input1); - - // Final round and shift - input2 = _mm_add_epi16(input2, eight); - input3 = _mm_add_epi16(input3, eight); - - input2 = _mm_srai_epi16(input2, 4); - input3 = _mm_srai_epi16(input3, 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); - __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi32(d0, - _mm_cvtsi32_si128(*(const int *)(dest + stride))); - d2 = _mm_unpacklo_epi32( - _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2); - d0 = _mm_unpacklo_epi8(d0, zero); - d2 = _mm_unpacklo_epi8(d2, zero); - d0 = _mm_add_epi16(d0, input2); - d2 = _mm_add_epi16(d2, input3); - d0 = _mm_packus_epi16(d0, d2); - // store input0 - *(int *)dest = _mm_cvtsi128_si32(d0); - // store input1 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); - // store input2 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); - // store input3 - d0 = _mm_srli_si128(d0, 4); - *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); - } -} - -void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 4); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE4X4(dest + 0 * stride, dc_value); - RECON_AND_STORE4X4(dest + 1 * stride, dc_value); - RECON_AND_STORE4X4(dest + 2 * stride, dc_value); - RECON_AND_STORE4X4(dest + 3 * stride, dc_value); -} - -void aom_idct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - - array_transpose_4x4(in); - // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); - - // stage 2 - in[0] = _mm_add_epi16(u[0], u[1]); - in[1] = _mm_sub_epi16(u[0], u[1]); - in[1] = _mm_shuffle_epi32(in[1], 0x4E); -} - -void aom_iadst4_sse2(__m128i *in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - array_transpose_4x4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); -} - -// Define Macro for multiplying elements by constants and adding them together. -#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } - -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ - } - -void aom_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from aom_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); -} - -void aom_idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // 8x8 Transpose is copied from aom_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); -} - -void aom_iadst8_sse2(__m128i *in) { - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // transpose - array_transpose_8x8(in, in); - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); -} - -void aom_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } - - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); - - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); - - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } - -void aom_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - curr1 = l; - for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { - int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); - dest += stride; - } -} - -void iadst16_8col(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; - - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; - - // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); - - // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); -} - -void aom_idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); -} - -void aom_iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); -} - -void aom_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } - - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); - - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - } - - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } - - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } - - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } - - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); - - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); - - for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } - -#define IDCT32_34 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } - -#define IDCT32(in0, in1) \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \ - \ - const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } - -// Only upper-left 8x8 has non-zero coeff -void aom_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - for (i = 8; i < 32; ++i) { - in[i] = _mm_setzero_si128(); - } - - array_transpose_8x8(in, in); - // TODO(hkuang): Following transposes are unnecessary. But remove them will - // lead to performance drop on some devices. - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void aom_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; - - for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] = _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32(in, in + 16) - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); - } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32(in, in + 16) - - // 2_D: Calculate the results and store them to destination. - in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); - - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - if (a == 0) return; - - dc_value = _mm_set1_epi16(a); - - for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); - } -} - -// Apply a 32-element IDCT to 8 columns. This does not do any transposition -// of its input - the caller is expected to have done that. -// The input buffers are the top and bottom halves of an 8x32 block. -void idct32_8col(__m128i *in0, __m128i *in1) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - IDCT32(in0, in1) - - // 2_D: Calculate the results and store them to destination. - in0[0] = _mm_add_epi16(stp1_0, stp1_31); - in0[1] = _mm_add_epi16(stp1_1, stp1_30); - in0[2] = _mm_add_epi16(stp1_2, stp1_29); - in0[3] = _mm_add_epi16(stp1_3, stp1_28); - in0[4] = _mm_add_epi16(stp1_4, stp1_27); - in0[5] = _mm_add_epi16(stp1_5, stp1_26); - in0[6] = _mm_add_epi16(stp1_6, stp1_25); - in0[7] = _mm_add_epi16(stp1_7, stp1_24); - in0[8] = _mm_add_epi16(stp1_8, stp1_23); - in0[9] = _mm_add_epi16(stp1_9, stp1_22); - in0[10] = _mm_add_epi16(stp1_10, stp1_21); - in0[11] = _mm_add_epi16(stp1_11, stp1_20); - in0[12] = _mm_add_epi16(stp1_12, stp1_19); - in0[13] = _mm_add_epi16(stp1_13, stp1_18); - in0[14] = _mm_add_epi16(stp1_14, stp1_17); - in0[15] = _mm_add_epi16(stp1_15, stp1_16); - in1[0] = _mm_sub_epi16(stp1_15, stp1_16); - in1[1] = _mm_sub_epi16(stp1_14, stp1_17); - in1[2] = _mm_sub_epi16(stp1_13, stp1_18); - in1[3] = _mm_sub_epi16(stp1_12, stp1_19); - in1[4] = _mm_sub_epi16(stp1_11, stp1_20); - in1[5] = _mm_sub_epi16(stp1_10, stp1_21); - in1[6] = _mm_sub_epi16(stp1_9, stp1_22); - in1[7] = _mm_sub_epi16(stp1_8, stp1_23); - in1[8] = _mm_sub_epi16(stp1_7, stp1_24); - in1[9] = _mm_sub_epi16(stp1_6, stp1_25); - in1[10] = _mm_sub_epi16(stp1_5, stp1_26); - in1[11] = _mm_sub_epi16(stp1_4, stp1_27); - in1[12] = _mm_sub_epi16(stp1_3, stp1_28); - in1[13] = _mm_sub_epi16(stp1_2, stp1_29); - in1[14] = _mm_sub_epi16(stp1_1, stp1_30); - in1[15] = _mm_sub_epi16(stp1_0, stp1_31); -} diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h b/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h deleted file mode 100644 index 342816977..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_sse2.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_ -#define AOM_DSP_X86_INV_TXFM_SSE2_H_ - -#include // SSE2 -#include "./aom_config.h" -#include "aom/aom_integer.h" -#include "aom_dsp/inv_txfm.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -// perform 8x8 transpose -static INLINE void array_transpose_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); -} - -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); -} - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ - out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ - out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ - out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \ - } - -#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - \ - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ - in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ - } - -static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - - out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4); - out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4); - out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6); - out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6); -} - -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -// Function to allow 8 bit optimisations to be used when profile 0 is used with -// highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { - if (sizeof(tran_low_t) == 4) { - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); - } else { - return _mm_load_si128((const __m128i *)data); - } -} - -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); -} - -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } - -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); -} - -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ - } - -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } - -void iadst16_8col(__m128i *in); -void idct16_8col(__m128i *in); -void aom_idct4_sse2(__m128i *in); -void aom_idct8_sse2(__m128i *in); -void aom_idct16_sse2(__m128i *in0, __m128i *in1); -void aom_iadst4_sse2(__m128i *in); -void aom_iadst8_sse2(__m128i *in); -void aom_iadst16_sse2(__m128i *in0, __m128i *in1); -void idct32_8col(__m128i *in0, __m128i *in1); - -#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c b/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c deleted file mode 100644 index 9d006797b..000000000 --- a/third_party/aom/aom_dsp/x86/inv_txfm_ssse3.c +++ /dev/null @@ -1,1333 +0,0 @@ -/* - * Copyright (c) 2017 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/x86/inv_txfm_sse2.h" -#include "aom_dsp/x86/txfm_common_sse2.h" - -void aom_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - { - /* Stage1 */ - { - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); - - { - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp1 = _mm_madd_epi16(hi_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp3 = _mm_madd_epi16(hi_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp5 = _mm_madd_epi16(hi_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - tmp7 = _mm_madd_epi16(hi_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - tmp4 = _mm_srai_epi32(tmp4, 14); - tmp5 = _mm_srai_epi32(tmp5, 14); - tmp6 = _mm_srai_epi32(tmp6, 14); - tmp7 = _mm_srai_epi32(tmp7, 14); - - stp1_4 = _mm_packs_epi32(tmp0, tmp1); - stp1_7 = _mm_packs_epi32(tmp2, tmp3); - stp1_5 = _mm_packs_epi32(tmp4, tmp5); - stp1_6 = _mm_packs_epi32(tmp6, tmp7); - } - } - - /* Stage2 */ - { - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); - - { - tmp0 = _mm_unpacklo_epi16(in0, in4); - tmp1 = _mm_unpackhi_epi16(in0, in4); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp4 = _mm_madd_epi16(tmp0, stk2_1); - tmp5 = _mm_madd_epi16(tmp1, stk2_1); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp2, tmp3); - stp2_1 = _mm_packs_epi32(tmp4, tmp5); - - tmp0 = _mm_madd_epi16(lo_26, stg2_2); - tmp1 = _mm_madd_epi16(hi_26, stg2_2); - tmp2 = _mm_madd_epi16(lo_26, stg2_3); - tmp3 = _mm_madd_epi16(hi_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - - stp2_2 = _mm_packs_epi32(tmp0, tmp1); - stp2_3 = _mm_packs_epi32(tmp2, tmp3); - } - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - } - - /* Stage3 */ - { - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp4 = _mm_madd_epi16(tmp0, stk2_0); - tmp5 = _mm_madd_epi16(tmp1, stk2_0); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - stp1_6 = _mm_packs_epi32(tmp4, tmp5); - } - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - } - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -void aom_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3; - - // Rows. Load 4-row input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - - // Stage1 - tmp0 = _mm_mulhrs_epi16(in0, stg1_0); - tmp1 = _mm_mulhrs_epi16(in0, stg1_1); - tmp2 = _mm_mulhrs_epi16(in1, stg1_2); - tmp3 = _mm_mulhrs_epi16(in1, stg1_3); - - stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); - stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); - - // Stage2 - tmp0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); - - tmp1 = _mm_mulhrs_epi16(in1, stg2_2); - tmp2 = _mm_mulhrs_epi16(in1, stg2_3); - stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); - - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - - tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); - tmp1 = _mm_madd_epi16(tmp0, stg3_0); - tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp1, tmp2); - - // Stage3 - tmp2 = _mm_add_epi16(stp2_0, stp2_2); - tmp3 = _mm_sub_epi16(stp2_0, stp2_2); - - stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); - stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); - - // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - /* Stage1 */ - stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); - stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); - stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); - stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); - - /* Stage2 */ - stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); - - stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); - stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - /* Stage3 */ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} - -// Only do addition and subtraction butterfly, size = 16, 32 -static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, - int size) { - int i = 0; - const int num = size >> 1; - const int bound = size - 1; - while (i < num) { - out[i] = _mm_add_epi16(in[i], in[bound - i]); - out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); - i++; - } -} - -#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ - do { \ - tmp0 = _mm_madd_epi16(x0, co0); \ - tmp1 = _mm_madd_epi16(x1, co0); \ - tmp2 = _mm_madd_epi16(x0, co1); \ - tmp3 = _mm_madd_epi16(x1, co1); \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - } while (0) - -static INLINE void butterfly(const __m128i *x0, const __m128i *x1, - const __m128i *c0, const __m128i *c1, __m128i *y0, - __m128i *y1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *y0 = _mm_packs_epi32(tmp0, tmp1); - *y1 = _mm_packs_epi32(tmp2, tmp3); -} - -static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, - const __m128i *c1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *x0 = _mm_packs_epi32(tmp0, tmp1); - *x1 = _mm_packs_epi32(tmp2, tmp3); -} - -static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i x0, x1, x4, x5, x6, x7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - - // phase 1 - - // 0, 15 - u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15 - u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12 - v15 = _mm_add_epi16(u2, u3); - // in[0], in[4] - x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0] - x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7] - v0 = _mm_add_epi16(x0, x7); // stp2_0 - stp1[0] = _mm_add_epi16(v0, v15); - stp1[15] = _mm_sub_epi16(v0, v15); - - // in[2], in[6] - u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8 - u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11 - butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 - butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 - - v8 = _mm_add_epi16(u0, u1); - v9 = _mm_add_epi16(u4, u6); - v10 = _mm_sub_epi16(u4, u6); - v11 = _mm_sub_epi16(u0, u1); - v12 = _mm_sub_epi16(u2, u3); - v13 = _mm_sub_epi16(u5, u7); - v14 = _mm_add_epi16(u5, u7); - - butterfly_self(&v10, &v13, &stg6_0, &stg4_0); - butterfly_self(&v11, &v12, &stg6_0, &stg4_0); - - // 1, 14 - x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 - // stp1[2] = stp1[0], stp1[3] = stp1[1] - x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4] - butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); - v1 = _mm_add_epi16(x1, x6); // stp2_1 - v2 = _mm_add_epi16(x0, x5); // stp2_2 - stp1[1] = _mm_add_epi16(v1, v14); - stp1[14] = _mm_sub_epi16(v1, v14); - - stp1[2] = _mm_add_epi16(v2, v13); - stp1[13] = _mm_sub_epi16(v2, v13); - - v3 = _mm_add_epi16(x1, x4); // stp2_3 - v4 = _mm_sub_epi16(x1, x4); // stp2_4 - - v5 = _mm_sub_epi16(x0, x5); // stp2_5 - - v6 = _mm_sub_epi16(x1, x6); // stp2_6 - v7 = _mm_sub_epi16(x0, x7); // stp2_7 - stp1[3] = _mm_add_epi16(v3, v12); - stp1[12] = _mm_sub_epi16(v3, v12); - - stp1[6] = _mm_add_epi16(v6, v9); - stp1[9] = _mm_sub_epi16(v6, v9); - - stp1[7] = _mm_add_epi16(v7, v8); - stp1[8] = _mm_sub_epi16(v7, v8); - - stp1[4] = _mm_add_epi16(v4, v11); - stp1[11] = _mm_sub_epi16(v4, v11); - - stp1[5] = _mm_add_epi16(v5, v10); - stp1[10] = _mm_sub_epi16(v5, v10); -} - -static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - v16 = _mm_mulhrs_epi16(in[1], stk1_0); - v31 = _mm_mulhrs_epi16(in[1], stk1_1); - - v19 = _mm_mulhrs_epi16(in[7], stk1_6); - v28 = _mm_mulhrs_epi16(in[7], stk1_7); - - v20 = _mm_mulhrs_epi16(in[5], stk1_8); - v27 = _mm_mulhrs_epi16(in[5], stk1_9); - - v23 = _mm_mulhrs_epi16(in[3], stk1_14); - v24 = _mm_mulhrs_epi16(in[3], stk1_15); - - butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, &v30); - butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); - butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); - butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - u24 = _mm_add_epi16(v24, v27); - u27 = _mm_sub_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u28 = _mm_sub_epi16(v31, v28); - u31 = _mm_add_epi16(v28, v31); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - - stp1[16] = _mm_add_epi16(u16, u23); - stp1[23] = _mm_sub_epi16(u16, u23); - - stp1[17] = _mm_add_epi16(u17, u22); - stp1[22] = _mm_sub_epi16(u17, u22); - - stp1[18] = _mm_add_epi16(u18, u21); - stp1[21] = _mm_sub_epi16(u18, u21); - - stp1[19] = _mm_add_epi16(u19, u20); - stp1[20] = _mm_sub_epi16(u19, u20); - - stp1[24] = _mm_sub_epi16(u31, u24); - stp1[31] = _mm_add_epi16(u24, u31); - - stp1[25] = _mm_sub_epi16(u30, u25); - stp1[30] = _mm_add_epi16(u25, u30); - - stp1[26] = _mm_sub_epi16(u29, u26); - stp1[29] = _mm_add_epi16(u26, u29); - - stp1[27] = _mm_sub_epi16(u28, u27); - stp1[28] = _mm_add_epi16(u27, u28); - - butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); - butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); - butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); - butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0); -} - -// Only upper-left 8x8 has non-zero coeff -void aom_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - __m128i in[32], col[32]; - __m128i stp1[32]; - int i; - - // Load input data. Only need to load the top left 8x8 block. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - array_transpose_8x8(in, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); - - // 1_D: Store 32 intermediate results for each 8x32 block. - add_sub_butterfly(stp1, col, 32); - for (i = 0; i < 4; i++) { - int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); - - // 2_D: Calculate the results and store them to destination. - add_sub_butterfly(stp1, in, 32); - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } - - dest += 8; - } -} - -// in0[16] represents the left 8x16 block -// in1[16] represents the right 8x16 block -static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, - __m128i *in1) { - int i; - for (i = 0; i < 16; i++) { - in0[i] = load_input_data(input); - in1[i] = load_input_data(input + 8); - input += 32; - } -} - -static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, - __m128i *out1) { - array_transpose_8x8(in0, out0); - array_transpose_8x8(&in0[8], out1); - array_transpose_8x8(in1, &out0[8]); - array_transpose_8x8(&in1[8], &out1[8]); -} - -// Group the coefficient calculation into smaller functions -// to prevent stack spillover: -// quarter_1: 0-7 -// quarter_2: 8-15 -// quarter_3_4: 16-23, 24-31 -static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - - { - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - u0 = _mm_mulhrs_epi16(in[0], stk4_0); - u2 = _mm_mulhrs_epi16(in[8], stk4_2); - u3 = _mm_mulhrs_epi16(in[8], stk4_3); - u1 = u0; - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - { - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - u4 = _mm_mulhrs_epi16(in[4], stk3_0); - u7 = _mm_mulhrs_epi16(in[4], stk3_1); - u5 = _mm_mulhrs_epi16(in[12], stk3_2); - u6 = _mm_mulhrs_epi16(in[12], stk3_3); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - } - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = _mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); -} - -static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v8, v9, v10, v11, v12, v13, v14, v15; - - { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - u8 = _mm_mulhrs_epi16(in[2], stk2_0); - u15 = _mm_mulhrs_epi16(in[2], stk2_1); - u9 = _mm_mulhrs_epi16(in[14], stk2_2); - u14 = _mm_mulhrs_epi16(in[14], stk2_3); - u10 = _mm_mulhrs_epi16(in[10], stk2_4); - u13 = _mm_mulhrs_epi16(in[10], stk2_5); - u11 = _mm_mulhrs_epi16(in[6], stk2_6); - u12 = _mm_mulhrs_epi16(in[6], stk2_7); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - out[6] = _mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// 8x32 block even indexed 8 inputs of in[16], -// output first half 16 to out[32] -static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { - __m128i temp[16]; - idct32_8x32_135_quarter_1(in, temp); - idct32_8x32_135_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -// 8x32 block odd indexed 8 inputs of in[16], -// output second half 16 to out[32] -static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); - const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); - - const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); - const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); - const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); - - const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); - const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - u16 = _mm_mulhrs_epi16(in[1], stk1_0); - u31 = _mm_mulhrs_epi16(in[1], stk1_1); - u17 = _mm_mulhrs_epi16(in[15], stk1_2); - u30 = _mm_mulhrs_epi16(in[15], stk1_3); - - u18 = _mm_mulhrs_epi16(in[9], stk1_4); - u29 = _mm_mulhrs_epi16(in[9], stk1_5); - u19 = _mm_mulhrs_epi16(in[7], stk1_6); - u28 = _mm_mulhrs_epi16(in[7], stk1_7); - - u20 = _mm_mulhrs_epi16(in[5], stk1_8); - u27 = _mm_mulhrs_epi16(in[5], stk1_9); - u21 = _mm_mulhrs_epi16(in[11], stk1_10); - u26 = _mm_mulhrs_epi16(in[11], stk1_11); - - u22 = _mm_mulhrs_epi16(in[13], stk1_12); - u25 = _mm_mulhrs_epi16(in[13], stk1_13); - u23 = _mm_mulhrs_epi16(in[3], stk1_14); - u24 = _mm_mulhrs_epi16(in[3], stk1_15); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - v20 = _mm_sub_epi16(u19, u20); - v21 = _mm_sub_epi16(u18, u21); - v22 = _mm_sub_epi16(u17, u22); - v23 = _mm_sub_epi16(u16, u23); - - v24 = _mm_sub_epi16(u31, u24); - v25 = _mm_sub_epi16(u30, u25); - v26 = _mm_sub_epi16(u29, u26); - v27 = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); - butterfly(&v21, &v26, &stg6_0, &stg4_0, &out[5], &out[10]); - butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); - butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); - } -} - -// 8x16 block, input __m128i in[16], output __m128i in[32] -static void idct32_8x32_135(__m128i *in /*in[32]*/) { - __m128i out[32]; - idct32_8x32_quarter_1_2(in, out); - idct32_8x32_quarter_3_4(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - int j = 0; - while (j < 32) { - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); - - in[j] = _mm_srai_epi16(in[j], 6); - in[j + 1] = _mm_srai_epi16(in[j + 1], 6); - - RECON_AND_STORE(dst, in[j]); - dst += stride; - RECON_AND_STORE(dst, in[j + 1]); - dst += stride; - j += 2; - } -} - -static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, - int stride) { - store_buffer_8x32(in0, dest, stride); - store_buffer_8x32(in1, dest + 8, stride); -} - -static INLINE void idct32_135(__m128i *col0, __m128i *col1) { - idct32_8x32_135(col0); - idct32_8x32_135(col1); -} - -typedef enum { left_16, right_16 } ColsIndicator; - -static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store, - ColsIndicator cols) { - switch (cols) { - case left_16: { - int i; - array_transpose_16x16(in0, in1); - for (i = 0; i < 16; ++i) { - store[i] = in0[16 + i]; - store[16 + i] = in1[16 + i]; - } - break; - } - case right_16: { - array_transpose_16x16_2(store, &store[16], in0, in1); - break; - } - default: { assert(0); } - } -} - -// Only upper-left 16x16 has non-zero coeff -void aom_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - // Each array represents an 8x32 block - __m128i col0[32], col1[32]; - // This array represents a 16x16 block - __m128i temp[32]; - - // Load input data. Only need to load the top left 16x16 block. - load_buffer_16x16(input, col0, col1); - - // columns - array_transpose_16x16(col0, col1); - idct32_135(col0, col1); - - // rows - transpose_and_copy_16x16(col0, col1, temp, left_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest, stride); - - transpose_and_copy_16x16(col0, col1, temp, right_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest + 16, stride); -} - -// For each 8x32 block __m128i in[32], -// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 -// output pixels: 8-15 in __m128i in[32] -static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ - __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ - - { - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); - butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); - butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); - } - - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[6] = _mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// For each 8x32 block __m128i in[32], -// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 -// output pixels: 0-7 in __m128i in[32] -static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ - __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ - - { - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); - butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - - butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); - butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = _mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); -} - -// For each 8x32 block __m128i in[32], -// Input with odd index, -// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -// output pixels: 16-23, 24-31 in __m128i in[32] -// We avoid hide an offset, 16, inside this function. So we output 0-15 into -// array out[16] -static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); - butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); - butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); - butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); - - butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); - butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); - - butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); - butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - out[4] = _mm_sub_epi16(u19, u20); - out[5] = _mm_sub_epi16(u18, u21); - out[6] = _mm_sub_epi16(u17, u22); - out[7] = _mm_sub_epi16(u16, u23); - - out[8] = _mm_sub_epi16(u31, u24); - out[9] = _mm_sub_epi16(u30, u25); - out[10] = _mm_sub_epi16(u29, u26); - out[11] = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); - butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); - butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); - butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); - } -} - -static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[16]; - idct32_full_8x32_quarter_1(in, temp); - idct32_full_8x32_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); -} - -static void idct32_full_8x32(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[32]; - idct32_full_8x32_quarter_1_2(in, temp); - idct32_full_8x32_quarter_3_4(in, &temp[16]); - add_sub_butterfly(temp, out, 32); -} - -static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { - int i; - for (i = 0; i < 8; ++i) { - in[i] = load_input_data(input); - in[i + 8] = load_input_data(input + 8); - in[i + 16] = load_input_data(input + 16); - in[i + 24] = load_input_data(input + 24); - input += 32; - } -} - -void aom_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i col[128], in[32]; - int i, j; - - // rows - for (i = 0; i < 4; ++i) { - load_buffer_8x32(input, in); - input += 32 << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - idct32_full_8x32(in, col + (i << 5)); - } - - // columns - for (i = 0; i < 4; ++i) { - j = i << 3; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - idct32_full_8x32(in, in); - store_buffer_8x32(in, dest, stride); - dest += 8; - } -} diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm index f0668e6f3..0bc841a7a 100644 --- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm +++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm @@ -85,15 +85,10 @@ SECTION .text INIT_XMM sse2 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride -%if CONFIG_HIGHBITDEPTH mova m0, [inputq + 0] packssdw m0, [inputq + 16] mova m1, [inputq + 32] packssdw m1, [inputq + 48] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] -%endif psraw m0, 2 psraw m1, 2 diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c new file mode 100644 index 000000000..c3c88245a --- /dev/null +++ b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 4); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 4) { + __m128i x0 = xx_loadl_32(a + 0 * a_stride); + __m128i x1 = xx_loadl_32(a + 1 * a_stride); + __m128i x2 = xx_loadl_32(a + 2 * a_stride); + __m128i x3 = xx_loadl_32(a + 3 * a_stride); + __m128i x_lo = _mm_unpacklo_epi32(x0, x1); + __m128i x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i x = _mm_unpacklo_epi64(x_lo, x_hi); + + x0 = xx_loadl_32(b + 0 * b_stride); + x1 = xx_loadl_32(b + 1 * b_stride); + x2 = xx_loadl_32(b + 2 * b_stride); + x3 = xx_loadl_32(b + 3 * b_stride); + x_lo = _mm_unpacklo_epi32(x0, x1); + x_hi = _mm_unpacklo_epi32(x2, x3); + + __m128i y = _mm_unpacklo_epi64(x_lo, x_hi); + + __m128i sad4x4 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad4x4); + + a += 4 * a_stride; + b += 4 * b_stride; + } + + // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95]. + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 8); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; i += 2) { + __m128i x0 = xx_loadl_64(a + 0 * a_stride); + __m128i x1 = xx_loadl_64(a + 1 * a_stride); + + __m128i x = _mm_unpacklo_epi64(x0, x1); + + x0 = xx_loadl_64(b + 0 * b_stride); + x1 = xx_loadl_64(b + 1 * b_stride); + + __m128i y = _mm_unpacklo_epi64(x0, x1); + + __m128i sad8x2 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad8x2); + + a += 2 * a_stride; + b += 2 * b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i; + assert(width == 16); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + __m128i x = xx_loadu_128(a); + __m128i y = xx_loadu_128(b); + + __m128i sad16x1 = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad16x1); + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 32); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 2; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad32_half = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad32_half); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 64); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 4; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int i, j; + assert(width == 128); + (void)width; + + __m128i sad = _mm_setzero_si128(); + for (i = 0; i < height; ++i) { + for (j = 0; j < 8; ++j) { + __m128i x = xx_loadu_128(a + j * 16); + __m128i y = xx_loadu_128(b + j * 16); + + __m128i sad64_quarter = _mm_sad_epu8(x, y); + sad = _mm_add_epi32(sad, sad64_quarter); + } + + a += a_stride; + b += b_stride; + } + + const unsigned int res = + _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)); + + return res; +} + +#define jnt_sadMxN_sse2(m, n) \ + unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ + } + +#define jnt_sadMxN_avx2(m, n) \ + unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint8_t comp_pred[m * n]; \ + aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ + return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ + } + +/* clang-format off */ +jnt_sadMxN_sse2(128, 128) +jnt_sadMxN_sse2(128, 64) +jnt_sadMxN_sse2(64, 128) +jnt_sadMxN_sse2(64, 64) +jnt_sadMxN_sse2(64, 32) +jnt_sadMxN_sse2(32, 64) +jnt_sadMxN_sse2(32, 32) +jnt_sadMxN_sse2(32, 16) +jnt_sadMxN_sse2(16, 32) +jnt_sadMxN_sse2(16, 16) +jnt_sadMxN_sse2(16, 8) +jnt_sadMxN_sse2(8, 16) +jnt_sadMxN_sse2(8, 8) +jnt_sadMxN_sse2(8, 4) +jnt_sadMxN_sse2(4, 8) +jnt_sadMxN_sse2(4, 4) +jnt_sadMxN_sse2(4, 16) +jnt_sadMxN_sse2(16, 4) +jnt_sadMxN_sse2(8, 32) +jnt_sadMxN_sse2(32, 8) +jnt_sadMxN_sse2(16, 64) +jnt_sadMxN_sse2(64, 16) + /* clang-format on */ diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c new file mode 100644 index 000000000..9801e285c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // SSE2 +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow + // in computation using _mm_maddubs_epi16. + // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. + const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; + const __m128i r = _mm_set1_epi16(round); + const uint8_t f0 = filter[0] >> 1; + const uint8_t f1 = filter[1] >> 1; + const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, + f0, f1, f0, f1, f0, f1); + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + unsigned int i, j; + (void)pixel_step; + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + // load source + __m128i source_low = xx_loadl_64(a); + __m128i source_hi = _mm_setzero_si128(); + + // avoid load undefined memory + if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8); + __m128i source = _mm_unpacklo_epi64(source_low, source_hi); + + // shuffle to: + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] + __m128i res = _mm_maddubs_epi16(source_shuffle, filters); + + // round + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storeu_128(b, res); + + a += 8; + b += 8; + } + + a += src_pixels_per_line - output_width; + } + } else { + for (i = 0; i < output_height; ++i) { + // load source, only first 5 values are meaningful: + // { a[0], a[1], a[2], a[3], a[4], xxxx } + __m128i source = xx_loadl_64(a); + + // shuffle, up to the first 8 are useful + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + __m128i res = _mm_maddubs_epi16(source_shuffle, filters); + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storel_64(b, res); + + a += src_pixels_per_line; + b += output_width; + } + } +} + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + const int16_t round = (1 << FILTER_BITS) >> 1; + const __m128i r = _mm_set1_epi32(round); + const __m128i filters = + _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], + filter[1], filter[0], filter[1]); + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + const __m128i mask = + _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + // load source as: + // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } + __m128i source1 = xx_loadl_64(a); + __m128i source2 = xx_loadl_64(a + pixel_step); + __m128i source = _mm_unpacklo_epi64(source1, source2); + + // shuffle source to: + // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + // b[i] = a[i] * filter[0] + a[w + i] * filter[1] + __m128i res = _mm_madd_epi16(source_shuffle, filters); + + // round + res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); + + // shuffle to get each lower 8 bit of every 32 bit + res = _mm_shuffle_epi8(res, mask); + + xx_storel_32(b, res); + + a += 4; + b += 4; + } + + a += src_pixels_per_line - output_width; + } +} + +static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w, const __m128i *r, + void *const result) { + __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); + __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w); + __m128i round_lo = _mm_add_epi16(mult_lo, *r); + __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS); + + __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1); + __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w); + __m128i round_hi = _mm_add_epi16(mult_hi, *r); + __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS); + + xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); +} + +void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const JNT_COMP_PARAMS *jcp_param) { + int i; + const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + if (width >= 16) { + // Read 16 pixels one row at a time + assert(!(width & 15)); + for (i = 0; i < height; ++i) { + int j; + for (j = 0; j < width; j += 16) { + __m128i p0 = xx_loadu_128(ref); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 16; + } + ref += ref_stride - width; + } + } else if (width >= 8) { + // Read 8 pixels two row at a time + assert(!(width & 7)); + assert(!(width & 1)); + for (i = 0; i < height; i += 2) { + __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride); + __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 2 * ref_stride; + } + } else { + // Read 4 pixels four row at a time + assert(!(width & 3)); + assert(!(height & 3)); + for (i = 0; i < height; i += 4) { + const uint8_t *row0 = ref + 0 * ref_stride; + const uint8_t *row1 = ref + 1 * ref_stride; + const uint8_t *row2 = ref + 2 * ref_stride; + const uint8_t *row3 = ref + 3 * ref_stride; + + __m128i p0 = + _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1], + row1[2], row1[3], row2[0], row2[1], row2[2], row2[3], + row3[0], row3[1], row3[2], row3[3]); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + ref += 4 * ref_stride; + } + } +} + +void aom_jnt_comp_avg_upsampled_pred_ssse3( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const JNT_COMP_PARAMS *jcp_param) { + int n; + int i; + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride); + /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ + assert(!(width * height & 15)); + n = width * height >> 4; + + const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; + const uint8_t w1 = (uint8_t)jcp_param->bck_offset; + const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, + w1, w0, w1, w0); + const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1); + const __m128i r = + _mm_set_epi16(round, round, round, round, round, round, round, round); + + for (i = 0; i < n; i++) { + __m128i p0 = xx_loadu_128(comp_pred); + __m128i p1 = xx_loadu_128(pred); + + compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + + comp_pred += 16; + pred += 16; + } +} + +#define JNT_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ + } + +JNT_SUBPIX_AVG_VAR(128, 128) +JNT_SUBPIX_AVG_VAR(128, 64) +JNT_SUBPIX_AVG_VAR(64, 128) +JNT_SUBPIX_AVG_VAR(64, 64) +JNT_SUBPIX_AVG_VAR(64, 32) +JNT_SUBPIX_AVG_VAR(32, 64) +JNT_SUBPIX_AVG_VAR(32, 32) +JNT_SUBPIX_AVG_VAR(32, 16) +JNT_SUBPIX_AVG_VAR(16, 32) +JNT_SUBPIX_AVG_VAR(16, 16) +JNT_SUBPIX_AVG_VAR(16, 8) +JNT_SUBPIX_AVG_VAR(8, 16) +JNT_SUBPIX_AVG_VAR(8, 8) +JNT_SUBPIX_AVG_VAR(8, 4) +JNT_SUBPIX_AVG_VAR(4, 8) +JNT_SUBPIX_AVG_VAR(4, 4) +JNT_SUBPIX_AVG_VAR(4, 16) +JNT_SUBPIX_AVG_VAR(16, 4) +JNT_SUBPIX_AVG_VAR(8, 32) +JNT_SUBPIX_AVG_VAR(32, 8) +JNT_SUBPIX_AVG_VAR(16, 64) +JNT_SUBPIX_AVG_VAR(64, 16) diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c index bf8150e2a..18862dd3e 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c @@ -11,13 +11,14 @@ #include /* AVX2 */ -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" -void aom_lpf_horizontal_edge_8_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void aom_lpf_horizontal_16_avx2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -368,7 +369,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void aom_lpf_horizontal_edge_16_avx2(unsigned char *s, int p, +void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index 8343dbbed..f1eac233b 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -11,7 +11,9 @@ #include // SSE2 -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #include "aom_ports/emmintrin_compat.h" @@ -19,1047 +21,1016 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -#if CONFIG_PARALLEL_DEBLOCKING -// filter_mask and hev_mask -#define FILTER_HEV_MASK4 \ - do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = \ - _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask2(*limit, *blimit, */ \ - /* p1, p0, q0, q1); */ \ - abs_p0q0 = \ - _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ - abs_p1q1 = \ - _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ - } while (0) -#endif // CONFIG_PARALLEL_DEBLOCKING - -// filter_mask and hev_mask -#define FILTER_HEV_MASK \ - do { \ - /* (abs(q1 - q0), abs(p1 - p0) */ \ - __m128i flat = abs_diff(q1p1, q0p0); \ - /* abs(p1 - q1), abs(p0 - q0) */ \ - const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \ - __m128i abs_p0q0, abs_p1q1, work; \ - \ - /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ - hev = \ - _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ - hev = _mm_packs_epi16(hev, hev); \ - \ - /* const int8_t mask = filter_mask(*limit, *blimit, */ \ - /* p3, p2, p1, p0, q0, q1, q2, q3); */ \ - abs_p0q0 = \ - _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \ - abs_p1q1 = \ - _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \ - abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \ - abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \ - /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ - mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \ - /* abs(p3 - p2), abs(p2 - p1) */ \ - work = abs_diff(p3p2, p2p1); \ - flat = _mm_max_epu8(work, flat); \ - /* abs(q3 - q2), abs(q2 - q1) */ \ - work = abs_diff(q3q2, q2q1); \ - flat = _mm_max_epu8(work, flat); \ - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ - mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ - mask = _mm_cmpeq_epi8(mask, zero); \ - mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \ - } while (0) - -#define FILTER4 \ - do { \ - const __m128i t3t4 = \ - _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ - __m128i filter, filter2filter1, work; \ - \ - ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ - qs1qs0 = _mm_xor_si128(q1q0, t80); \ - \ - /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \ - work = _mm_subs_epi8(ps1ps0, qs1qs0); \ - filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \ - /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); \ - filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \ - filter = _mm_and_si128(filter, mask); /* & mask */ \ - filter = _mm_unpacklo_epi64(filter, filter); \ - \ - /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \ - /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \ - filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \ - filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \ - filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \ - filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \ - filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \ - \ - /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \ - filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \ - filter = _mm_unpacklo_epi8(filter, filter); \ - filter = _mm_srai_epi16(filter, 9); /* round */ \ - filter = _mm_packs_epi16(filter, filter); \ - filter = _mm_andnot_si128(hev, filter); \ - \ - hev = _mm_unpackhi_epi64(filter2filter1, filter); \ - filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \ - \ - /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \ - qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \ - /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \ - ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \ - qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \ - ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ - } while (0) +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i *d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = _mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); + *d12d13 = _mm_unpacklo_epi64(w7, w15); + *d14d15 = _mm_unpackhi_epi64(w7, w15); +} + +static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, __m128i *ps1ps0) { + const __m128i t3t4 = + _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8(0x80); + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi64(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter = _mm_srai_epi16(filter, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + + hev1 = _mm_unpackhi_epi64(filter2filter1, filter); + filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); + + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void lpf_internal_4_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; + __m128i mask, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + + /* (abs(q1 - q0), abs(p1 - p0) */ + __m128i flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + + /* const int8_t mask = filter_mask2(*limit, *blimit, */ + /* p1, p0, q0, q1); */ + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi64(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3p2, p2p1, q3q2, q2q1; -#endif // !CONFIG_PARALLEL_DEBLOCKING - __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; -#if !CONFIG_PARALLEL_DEBLOCKING - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); -#endif // !CONFIG_PARALLEL_DEBLOCKING - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); -#if !CONFIG_PARALLEL_DEBLOCKING - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); -#endif // !CONFIG_PARALLEL_DEBLOCKING - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); - q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); -#if !CONFIG_PARALLEL_DEBLOCKING - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); -#endif // !CONFIG_PARALLEL_DEBLOCKING -#if !CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK; -#else // CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK4; -#endif // !CONFIG_PARALLEL_DEBLOCKING - FILTER4; - -#if CONFIG_PARALLEL_DEBLOCKING - *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 8); - *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(ps1ps0); - - *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 8); - *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(qs1qs0); -#else - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 -#endif + + __m128i qs1qs0, ps1ps0; + __m128i p1, p0, q0, q1; + + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); + + xx_storel_32(s - 1 * p, ps1ps0); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8)); + xx_storel_32(s + 0 * p, qs1qs0); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8)); } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = + __m128i p1p0, q1q0; + __m128i p1, p0, q0, q1; + + const __m128i zero = _mm_setzero_si128(); + __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + _mm_loadl_epi64((const __m128i *)_limit)); + __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); + __m128i x0, x1, x2, x3; -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3p2, p2p1, q3q2, q2q1; -#endif // !CONFIG_PARALLEL_DEBLOCKING - __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0; - __m128i mask, hev; + __m128i d0, d1, d2, d3; + x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p)); - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); - - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); - - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); - - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); - - // Transpose 8x8 - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - p1p0 = _mm_unpacklo_epi16(q1q0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x0 = _mm_unpacklo_epi16(x2, x3); -#if !CONFIG_PARALLEL_DEBLOCKING - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - p3p2 = _mm_unpacklo_epi32(p1p0, x0); -#endif // !CONFIG_PARALLEL_DEBLOCKING - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - p1p0 = _mm_unpackhi_epi32(p1p0, x0); -#if !CONFIG_PARALLEL_DEBLOCKING - p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high -#endif // !CONFIG_PARALLEL_DEBLOCKING - p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - q1q0 = _mm_unpackhi_epi16(q1q0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x2 = _mm_unpackhi_epi16(x2, x3); -#if !CONFIG_PARALLEL_DEBLOCKING - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - q3q2 = _mm_unpackhi_epi32(q1q0, x2); -#endif // !CONFIG_PARALLEL_DEBLOCKING - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - q1q0 = _mm_unpacklo_epi32(q1q0, x2); - - q0p0 = _mm_unpacklo_epi64(p1p0, q1q0); - q1p1 = _mm_unpackhi_epi64(p1p0, q1q0); - p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); -#if !CONFIG_PARALLEL_DEBLOCKING - p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); - q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2); -#endif // !CONFIG_PARALLEL_DEBLOCKING -#if !CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK; -#else // CONFIG_PARALLEL_DEBLOCKING - FILTER_HEV_MASK4; -#endif // !CONFIG_PARALLEL_DEBLOCKING - FILTER4; + transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1); + + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); // Transpose 8x4 to 4x8 - // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37 - // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07 - // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 - ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8)); - // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37 - x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0); - // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0); -#if !CONFIG_PARALLEL_DEBLOCKING - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0); -#endif - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); - ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); -#if !CONFIG_PARALLEL_DEBLOCKING - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); - qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); -#endif + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); } -static INLINE void store_buffer_horz_8(const __m128i *x, int p, int num, - uint8_t *s) { -#if CONFIG_PARALLEL_DEBLOCKING - *(int32_t *)(s - (num + 1) * p) = _mm_cvtsi128_si32(*x); - const __m128i hi = _mm_srli_si128(*x, 8); - *(int32_t *)(s + num * p) = _mm_cvtsi128_si32(hi); -#else - _mm_storel_epi64((__m128i *)(s - (num + 1) * p), *x); - _mm_storeh_pi((__m64 *)(s + num * p), _mm_castsi128_ps(*x)); -#endif +static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { + xx_storel_32(s - (num + 1) * p, x); + xx_storel_32(s + num * p, _mm_srli_si128(x, 8)); } -void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); __m128i mask, hev, flat, flat2; - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; __m128i abs_p1p0; - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); + p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1); + q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1); { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = abs_diff(q1p1, q0p0); + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + __m128i fe, ff, work; + abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8(0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); + // replicate for the further "merged variables" usage + mask = _mm_unpacklo_epi64(mask, mask); } - // lp filter + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); + qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); + // loopfilter done + + __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; + __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; + { + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - // Filter1 >> 3 - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - // filt >> 1 - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); - flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); - work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - store_buffer_horz_8(&q6p6, p, 6, s); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - store_buffer_horz_8(&q5p5, p, 5, s); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - store_buffer_horz_8(&q4p4, p, 4, s); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - store_buffer_horz_8(&q3p3, p, 3, s); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - store_buffer_horz_8(&q2p2, p, 2, s); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - store_buffer_horz_8(&q1p1, p, 1, s); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - store_buffer_horz_8(&q0p0, p, 0, s); + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; + __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pixelFilter_p, pixelFilter_q; + __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; + __m128i sum_p6, sum_q6; + __m128i sum_p3, sum_q3, res_p, res_q; + + p6_16 = _mm_unpacklo_epi8(*q6p6, zero); + p5_16 = _mm_unpacklo_epi8(*q5p5, zero); + p4_16 = _mm_unpacklo_epi8(*q4p4, zero); + p3_16 = _mm_unpacklo_epi8(*q3p3, zero); + p2_16 = _mm_unpacklo_epi8(*q2p2, zero); + p1_16 = _mm_unpacklo_epi8(*q1p1, zero); + p0_16 = _mm_unpacklo_epi8(*q0p0, zero); + q0_16 = _mm_unpackhi_epi8(*q0p0, zero); + q1_16 = _mm_unpackhi_epi8(*q1p1, zero); + q2_16 = _mm_unpackhi_epi8(*q2p2, zero); + q3_16 = _mm_unpackhi_epi8(*q3p3, zero); + q4_16 = _mm_unpackhi_epi8(*q4p4, zero); + q5_16 = _mm_unpackhi_epi8(*q5p5, zero); + q6_16 = _mm_unpackhi_epi8(*q6p6, zero); + pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16)); + pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16)); + + pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); + pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); + + pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); + pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); + pixelFilter_p = + _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); + pixetFilter_p2p1p0 = _mm_add_epi16( + four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(p6_16, p0_16), + _mm_add_epi16(p1_16, q0_16))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixelFilter_p, + _mm_add_epi16(_mm_add_epi16(q6_16, q0_16), + _mm_add_epi16(p0_16, q1_16))), + 4); + flat2_q0p0 = _mm_packus_epi16(res_p, res_q); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); + + flat_q0p0 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(p6_16, p6_16); + sum_q6 = _mm_add_epi16(q6_16, q6_16); + sum_p3 = _mm_add_epi16(p3_16, p3_16); + sum_q3 = _mm_add_epi16(q3_16, q3_16); + + pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))), + 4); + flat2_q1p1 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); + flat_q1p1 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); + pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); + res_q = _mm_srli_epi16( + _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); + flat_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); } -} + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -static INLINE __m128i filter_add2_sub2(const __m128i *const total, - const __m128i *const a1, - const __m128i *const a2, - const __m128i *const s1, - const __m128i *const s2) { - __m128i x = _mm_add_epi16(*a1, *total); - x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2); - return x; -} + flat = _mm_shuffle_epi32(flat, 68); + flat2 = _mm_shuffle_epi32(flat2, 68); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); -static INLINE __m128i filter8_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f8_lo, - const __m128i *const f8_hi) { - const __m128i f8 = - _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3)); - const __m128i result = _mm_and_si128(*flat, f8); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); } -static INLINE __m128i filter16_mask(const __m128i *const flat, - const __m128i *const other_filt, - const __m128i *const f_lo, - const __m128i *const f_hi) { - const __m128i f = - _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4)); - const __m128i result = _mm_and_si128(*flat, f); - return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); +void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), + _mm_cvtsi32_si128(*(int *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), + _mm_cvtsi32_si128(*(int *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), + _mm_cvtsi32_si128(*(int *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), + _mm_cvtsi32_si128(*(int *)(s + 1 * p))); + + q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), + _mm_cvtsi32_si128(*(int *)(s - 0 * p))); + + q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), + _mm_cvtsi32_si128(*(int *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), + _mm_cvtsi32_si128(*(int *)(s + 6 * p))); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + store_buffer_horz_8(q0p0, p, 0, s); + store_buffer_horz_8(q1p1, p, 1, s); + store_buffer_horz_8(q2p2, p, 2, s); + store_buffer_horz_8(q3p3, p, 3, s); + store_buffer_horz_8(q4p4, p, 4, s); + store_buffer_horz_8(q5p5, p, 5, s); } -typedef enum { FOUR_PIXELS, EIGHT_PIXELS, SIXTEEN_PIXELS } PixelOutput; +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16; + __m128i ps1ps0, qs1qs0; -static INLINE void store_buffer_horz_16(PixelOutput pixel_num, const __m128i *x, - int p, int offset, uint8_t *s) { - int i; - if (pixel_num == FOUR_PIXELS) { - for (i = 13; i >= 0; i--) { - *(int32_t *)(s - (i - offset) * p) = _mm_cvtsi128_si32(x[i]); - } - } - if (pixel_num == EIGHT_PIXELS) { - for (i = 13; i >= 0; i--) { - _mm_storel_epi64((__m128i *)(s - (i - offset) * p), x[i]); - } - } - if (pixel_num == SIXTEEN_PIXELS) { - for (i = 13; i >= 0; i--) { - _mm_storeu_si128((__m128i *)(s - (i - offset) * p), x[i]); - } - } -} + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); -static INLINE void lpf_horz_edge_16_internal(PixelOutput pixel_num, - unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - __m128i mask, hev, flat, flat2; - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - - __m128i op2, op1, op0, oq0, oq1, oq2; - - __m128i max_abs_p1p0q1q0; - - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); { - const __m128i abs_p1p0 = abs_diff(p1, p0); - const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(zero, zero); - __m128i abs_p0q0 = abs_diff(p0, q0); - __m128i abs_p1q1 = abs_diff(p1, q1); - __m128i work; - max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); + mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); + + work = abs_diff(q2p2, q1p1); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - } + // replicate for the further "merged variables" usage + mask = _mm_unpacklo_epi64(mask, mask); - { - __m128i work; - work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); - flat = _mm_max_epu8(work, max_abs_p1p0q1q0); - work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0)); - flat2 = _mm_max_epu8(work, flat2); - work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter4 + // 5 tap filter { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ff = _mm_cmpeq_epi8(t4, t4); - - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - op1 = _mm_xor_si128(p1, t80); - op0 = _mm_xor_si128(p0, t80); - oq0 = _mm_xor_si128(q0, t80); - oq1 = _mm_xor_si128(q1, t80); - - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); - - work_a = _mm_subs_epi8(oq0, op0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); - oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); - // loopfilter done - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // filter8 - { - const __m128i four = _mm_set1_epi16(4); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - __m128i f8_lo, f8_hi; - - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), - _mm_add_epi16(p3_lo, p2_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); - - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), - _mm_add_epi16(p3_hi, p2_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); - - op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); - op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); - op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); - oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); - oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); - - f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); - f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); - oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); - const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); - const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); - const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); - const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); - const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); - const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); - const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); - const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); - const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); - const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); - const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); - const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); - const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); - const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); - const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); - - const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); - const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); - const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); - const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); - const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); - const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); - const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); - const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); - const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); - const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); - const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); - const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); - const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); - const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); - const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); - const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); - - __m128i f_lo; - __m128i f_hi; - - f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 - f_lo = - _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), - _mm_add_epi16(p2_lo, p1_lo)); - f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); - f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); - - f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 - f_hi = - _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), - _mm_add_epi16(p2_hi, p1_hi)); - f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); - f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); - - __m128i x[14]; - x[13] = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); - x[12] = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); - x[11] = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); - x[10] = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); - x[9] = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); - x[8] = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); - x[7] = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); - x[6] = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); - x[5] = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); - x[4] = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); - x[3] = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); - x[2] = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); - x[1] = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - - f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); - f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); - x[0] = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - - store_buffer_horz_16(pixel_num, x, p, 6, s); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + const __m128i four = _mm_set1_epi16(4); + + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + + // op1 + workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16), + _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2 + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), + p2_16); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_shft1 = _mm_srli_epi16(workp_a, 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16), + p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_add_epi16(q1_16, q2_16); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16), + p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), + 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + + qs1qs0 = _mm_andnot_si128(flat, qs1qs0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, ps1ps0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); } -void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, +void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); +} + +void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2; + __m128i p1p0, q1q0; + + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + + lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); +} + +static AOM_FORCE_INLINE void lpf_internal_8_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_a, op2, oq2; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - p0q0 = _mm_shuffle_epi32(q0p0, 78); + q3p3 = _mm_unpacklo_epi64(*p3, *q3); + q2p2 = _mm_unpacklo_epi64(*p2, *q2); + q1p1 = _mm_unpacklo_epi64(*p1, *q1); + q0p0 = _mm_unpacklo_epi64(*p0, *q0); + + p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); + q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); { // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - abs_p0q0 = abs_diff(q0p0, p0q0); - abs_p1q1 = abs_diff(q1p1, p1q1); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); + abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, *thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi64(hev, hev); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -1067,424 +1038,215 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); + // replicate for the further "merged variables" usage + mask = _mm_unpacklo_epi64(mask, mask); // flat_mask4 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi64(flat, flat); } + // filter8 { const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[0], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[0], - _mm_packus_epi16(workp_shft, workp_shft)); - } - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i ps1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 11); - filter1 = _mm_packs_epi16(filter1, filter1); - - // Filter2 >> 3 - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 11); - filter2 = _mm_packs_epi16(filter2, zero); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - filt = _mm_unpacklo_epi8(zero, filt); - filt = _mm_srai_epi16(filt, 9); - filt = _mm_packs_epi16(filt, zero); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_loadl_epi64((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_loadl_epi64((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_loadl_epi64((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_loadl_epi64((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_loadl_epi64((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_loadl_epi64((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - -#if CONFIG_PARALLEL_DEBLOCKING - *(int32_t *)(s - 3 * p) = _mm_cvtsi128_si32(p2); - *(int32_t *)(s - 2 * p) = _mm_cvtsi128_si32(p1); - *(int32_t *)(s - 1 * p) = _mm_cvtsi128_si32(p0); - *(int32_t *)(s + 0 * p) = _mm_cvtsi128_si32(q0); - *(int32_t *)(s + 1 * p) = _mm_cvtsi128_si32(q1); - *(int32_t *)(s + 2 * p) = _mm_cvtsi128_si32(q2); -#else - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); -#endif + + __m128i workp_a, workp_b, workp_shft0, workp_shft1; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + op2 = _mm_packus_epi16(workp_shft0, workp_shft0); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + oq2 = _mm_packus_epi16(workp_shft1, workp_shft1); } + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + + qs1qs0 = _mm_andnot_si128(flat, qs1qs0); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, ps1ps0); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + + work_a = _mm_andnot_si128(flat, *q2); + q2_16 = _mm_and_si128(flat, oq2); + *q2_out = _mm_or_si128(work_a, q2_16); + + work_a = _mm_andnot_si128(flat, *p2); + p2_16 = _mm_and_si128(flat, op2); + *p2_out = _mm_or_si128(work_a, p2_16); } -void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { -#if CONFIG_PARALLEL_DEBLOCKING - lpf_horz_edge_16_internal(FOUR_PIXELS, s, p, _blimit, _limit, _thresh); -#else - lpf_horz_edge_16_internal(SIXTEEN_PIXELS, s, p, _blimit, _limit, _thresh); -#endif +void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0, p2_out, q2_out; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p)); + p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); + p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); + p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); + q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); + q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &p2_out, &q2_out, &blimit, &limit, &thresh); + + xx_storel_32(s - 1 * p, p1p0); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s + 0 * p, q1q0); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); + xx_storel_32(s - 3 * p, p2_out); + xx_storel_32(s + 2 * p, q2_out); } -void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), +void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); - const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), _mm_load_si128((const __m128i *)_thresh1)); - __m128i mask, hev, flat; - __m128i p3, p2, p1, p0, q0, q1, q2, q3; - - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); - __m128i work; + q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)), + _mm_loadl_epi64((__m128i *)(s + 4 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); - // filter_mask and hev_mask - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); + q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)), + _mm_loadl_epi64((__m128i *)(s + 5 * p))); + + q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), + _mm_loadl_epi64((__m128i *)(s + 6 * p))); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); + _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8)); + _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); + _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8)); + _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); + _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8)); + _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); + _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8)); +} - // flat_mask4 - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - int i = 0; - - do { - __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < 2); - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - } +void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i q1q0, p1p0, p2_out, q2_out; + + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &p2_out, &q2_out, &blimit, &limit, &thresh); + + _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out); } void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, @@ -1494,449 +1256,405 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, const unsigned char *_blimit1, const unsigned char *_limit1, const unsigned char *_thresh1) { + __m128i p1, p0, q0, q1; + __m128i qs1qs0, ps1ps0; + + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + + const __m128i zero = _mm_setzero_si128(); const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), _mm_load_si128((const __m128i *)_blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), _mm_load_si128((const __m128i *)_limit1)); - const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); - const __m128i zero = _mm_set1_epi16(0); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i p3, p2, q2, q3; -#endif // !CONFIG_PARALLEL_DEBLOCKING - __m128i p1, p0, q0, q1; - __m128i mask, hev, flat; -#if !CONFIG_PARALLEL_DEBLOCKING - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); -#endif // !CONFIG_PARALLEL_DEBLOCKING - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); -#if !CONFIG_PARALLEL_DEBLOCKING - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); -#endif // !CONFIG_PARALLEL_DEBLOCKING - // filter_mask and hev_mask - { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); -#if !CONFIG_PARALLEL_DEBLOCKING - __m128i work; -#endif // !CONFIG_PARALLEL_DEBLOCKING - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); -#if !CONFIG_PARALLEL_DEBLOCKING - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); -#endif // !CONFIG_PARALLEL_DEBLOCKING - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - // filter4 - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - // (aom_filter + 3 * (qs0 - ps0)) & mask - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - // Filter1 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - // Filter2 >> 3 - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - // filt >> 1 - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - } -} + __m128i l = _mm_unpacklo_epi64(blimit, limit); -static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, - int in_p, unsigned char *out, int out_p) { - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i x8, x9, x10, x11, x12, x13, x14, x15; - - // 2-way interleave w/hoisting of unpacks - x0 = _mm_loadl_epi64((__m128i *)in0); // 1 - x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3 - x0 = _mm_unpacklo_epi8(x0, x1); // 1 - - x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5 - x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); // 7 - x1 = _mm_unpacklo_epi8(x2, x3); // 2 - - x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); // 9 - x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); // 11 - x2 = _mm_unpacklo_epi8(x4, x5); // 3 - - x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); // 13 - x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); // 15 - x3 = _mm_unpacklo_epi8(x6, x7); // 4 - x4 = _mm_unpacklo_epi16(x0, x1); // 9 - - x8 = _mm_loadl_epi64((__m128i *)in1); // 2 - x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4 - x8 = _mm_unpacklo_epi8(x8, x9); // 5 - x5 = _mm_unpacklo_epi16(x2, x3); // 10 - - x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6 - x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); // 8 - x9 = _mm_unpacklo_epi8(x10, x11); // 6 - - x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); // 10 - x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); // 12 - x10 = _mm_unpacklo_epi8(x12, x13); // 7 - x12 = _mm_unpacklo_epi16(x8, x9); // 11 - - x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); // 14 - x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); // 16 - x11 = _mm_unpacklo_epi8(x14, x15); // 8 - x13 = _mm_unpacklo_epi16(x10, x11); // 12 - - x6 = _mm_unpacklo_epi32(x4, x5); // 13 - x7 = _mm_unpackhi_epi32(x4, x5); // 14 - x14 = _mm_unpacklo_epi32(x12, x13); // 15 - x15 = _mm_unpackhi_epi32(x12, x13); // 16 + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); - // Store first 4-line result - _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); - x4 = _mm_unpackhi_epi16(x0, x1); - x5 = _mm_unpackhi_epi16(x2, x3); - x12 = _mm_unpackhi_epi16(x8, x9); - x13 = _mm_unpackhi_epi16(x10, x11); + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - x6 = _mm_unpacklo_epi32(x4, x5); - x7 = _mm_unpackhi_epi32(x4, x5); - x14 = _mm_unpacklo_epi32(x12, x13); - x15 = _mm_unpackhi_epi32(x12, x13); + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); - // Store second 4-line result - _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); - _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); - _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); + _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); + _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); + _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); + _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8)); } -#if CONFIG_PARALLEL_DEBLOCKING -#define movq(p) _mm_loadl_epi64((const __m128i *)(p)) -#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1) -#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1) -#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1) -#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r) -#define pshufd(r, imm) _mm_shuffle_epi32(r, imm) -enum { ROTATE_DWORD_RIGHT = 0x39 }; -static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride, - const uint8_t *pSrc, - const ptrdiff_t srcStride) { - for (uint32_t idx = 0; idx < 2; idx += 1) { - __m128i r0, r1, r2, r3; - // load data - r0 = movq(pSrc); - r1 = movq(pSrc + srcStride); - r2 = movq(pSrc + srcStride * 2); - r3 = movq(pSrc + srcStride * 3); - // transpose - r0 = punpcklbw(r0, r1); - r2 = punpcklbw(r2, r3); - r1 = punpckhwd(r0, r2); - r0 = punpcklwd(r0, r2); - // store data - movd(pDst, r0); - r0 = pshufd(r0, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride, r0); - r0 = pshufd(r0, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 2, r0); - r0 = pshufd(r0, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 3, r0); - movd(pDst + dstStride * 4, r1); - r1 = pshufd(r1, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 5, r1); - r1 = pshufd(r1, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 6, r1); - r1 = pshufd(r1, ROTATE_DWORD_RIGHT); - movd(pDst + dstStride * 7, r1); - // advance the pointers - pDst += dstStride * 8; - pSrc += 8; - } -} - -#endif // CONFIG_PARALLEL_DEBLOCKING -static INLINE void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int num_8x8_to_transpose) { - int idx8x8 = 0; +void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i p0, q0, q1, p1; __m128i x0, x1, x2, x3, x4, x5, x6, x7; - do { - unsigned char *in = src[idx8x8]; - unsigned char *out = dst[idx8x8]; - - x0 = - _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 - x1 = - _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - x0 = _mm_unpacklo_epi8(x0, x1); - - x2 = - _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 - x3 = - _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(x2, x3); - - x4 = - _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 - x5 = - _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(x4, x5); - - x6 = - _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 - x7 = - _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(x6, x7); - - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - x4 = _mm_unpacklo_epi16(x0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x5 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0 * out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1 * out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2 * out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3 * out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi16(x0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi16(x2, x3); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4 * out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5 * out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi32(x4, x5); - - _mm_storel_pd((double *)(out + 6 * out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7 * out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i qs1qs0, ps1ps0; -void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); -#if !CONFIG_PARALLEL_DEBLOCKING - unsigned char *src[2]; - unsigned char *dst[2]; -#endif // !CONFIG_PARALLEL_DEBLOCKING - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + const __m128i zero = _mm_setzero_si128(); + const __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); - // Loop filtering - aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); -#if !CONFIG_PARALLEL_DEBLOCKING - src[0] = t_dst; - src[1] = t_dst + 8; - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; - - // Transpose back - transpose(src, 16, dst, p, 2); -#else // CONFIG_PARALLEL_DEBLOCKING - transpose16x4(s - 2, p, t_dst + 16 * 2, 16); -#endif // !CONFIG_PARALLEL_DEBLOCKING -} + __m128i l = _mm_unpacklo_epi64(blimit, limit); -void aom_lpf_vertical_8_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]); - unsigned char *src[1]; - unsigned char *dst[1]; + __m128i thresh0 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero); - // Transpose 8x8 - src[0] = s - 4; - dst[0] = t_dst; + __m128i thresh1 = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero); - transpose(src, p, dst, 8, 1); + __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - // Loop filtering - aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); + x0 = _mm_loadl_epi64((__m128i *)((s - 2))); + x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p)); - src[0] = t_dst; - dst[0] = s - 4; + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, + &q1); - // Transpose back - transpose(src, 8, dst, p, 1); -} + lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); -void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]); - unsigned char *src[2]; - unsigned char *dst[2]; + p1 = _mm_srli_si128(ps1ps0, 8); + q1 = _mm_srli_si128(qs1qs0, 8); - // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4, + &d5, &d6, &d7); - // Loop filtering - aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, - blimit1, limit1, thresh1); - src[0] = t_dst; - src[1] = t_dst + 8; + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); +} - dst[0] = s - 4; - dst[1] = s - 4 + p * 8; +void aom_lpf_vertical_6_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x2, x1, x0, x3; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + + lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); + + xx_storel_32(s + 0 * p - 2, d0); + xx_storel_32(s + 1 * p - 2, d1); + xx_storel_32(s + 2 * p - 2, d2); + xx_storel_32(s + 3 * p - 2, d3); +} - // Transpose back - transpose(src, 16, dst, p, 2); +void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); + + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i p0, q0; + __m128i p1p0, q1q0; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit, + &limit, &thresh); + + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); + + transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5, + &d6, &d7); + + xx_storel_32((s - 2 + 0 * p), d0); + xx_storel_32((s - 2 + 1 * p), d1); + xx_storel_32((s - 2 + 2 * p), d2); + xx_storel_32((s - 2 + 3 * p), d3); + xx_storel_32((s - 2 + 4 * p), d4); + xx_storel_32((s - 2 + 5 * p), d5); + xx_storel_32((s - 2 + 6 * p), d6); + xx_storel_32((s - 2 + 7 * p), d7); } -void aom_lpf_vertical_16_sse2(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh) { - DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]); - unsigned char *src[2]; - unsigned char *dst[2]; +void aom_lpf_vertical_8_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i d0, d1, d2, d3, d4, d5, d6, d7; + + __m128i p2, p0, q0, q2; + __m128i x2, x1, x0, x3; + __m128i q1q0, p1p0; + __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + __m128i limit = _mm_load_si128((const __m128i *)_limit); + __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + + x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p)); + x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p)); + x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p)); + x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p)); + + transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, + &d7); + // Loop filtering + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2, + &q2, &blimit, &limit, &thresh); - src[0] = s - 8; - src[1] = s; - dst[0] = t_dst; - dst[1] = t_dst + 8 * 8; + p0 = _mm_srli_si128(p1p0, 8); + q0 = _mm_srli_si128(q1q0, 8); - // Transpose 16x8 - transpose(src, p, dst, 8, 2); + transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1, + &d2, &d3); - // Loop filtering - aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3); +} - src[0] = t_dst; - src[1] = t_dst + 8 * 8; - dst[0] = s - 8; - dst[1] = s; +void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0), + _mm_load_si128((__m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0), + _mm_load_si128((__m128i *)_limit1)); + __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0), + _mm_load_si128((__m128i *)_thresh1)); - // Transpose back - transpose(src, 8, dst, p, 2); + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + __m128i d1, d3, d5, d7; + __m128i q1q0, p1p0; + __m128i p2, p1, q1, q2; + __m128i d0d1, d2d3, d4d5, d6d7; + + x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); + x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p)); + x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p)); + x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p)); + x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p)); + + transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5, + &d6d7); + + d1 = _mm_srli_si128(d0d1, 8); + d3 = _mm_srli_si128(d2d3, 8); + d5 = _mm_srli_si128(d4d5, 8); + d7 = _mm_srli_si128(d6d7, 8); + + lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0, + &p1p0, &p2, &q2, &blimit, &limit, &thresh); + + p1 = _mm_srli_si128(p1p0, 8); + q1 = _mm_srli_si128(q1q0, 8); + + transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3, + &d4d5, &d6d7); + + _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); + _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3); + _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5); + _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8)); + _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7); + _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8)); } -void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { - DECLARE_ALIGNED(16, unsigned char, t_dst[256]); - - // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); +void aom_lpf_vertical_14_sse2(unsigned char *s, int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3, x2, x1, x0; + __m128i p0, p1, p2, p3, p4, p5, p6, p7; + __m128i q0, q1, q2, q3, q4, q5, q6, q7; + __m128i p0_out, p1_out, p2_out, p3_out; + __m128i blimit = _mm_load_si128((__m128i *)_blimit); + __m128i limit = _mm_load_si128((__m128i *)_limit); + __m128i thresh = _mm_load_si128((__m128i *)_thresh); + + x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p)); + + transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6, + &p7); + + x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); + x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, + &q7); + + q6p6 = _mm_unpacklo_epi64(p1, q6); + q5p5 = _mm_unpacklo_epi64(p2, q5); + q4p4 = _mm_unpacklo_epi64(p3, q4); + q3p3 = _mm_unpacklo_epi64(p4, q3); + q2p2 = _mm_unpacklo_epi64(p5, q2); + q1p1 = _mm_unpacklo_epi64(p6, q1); + q0p0 = _mm_unpacklo_epi64(p7, q0); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &p0_out, &p1_out, &p2_out, &p3_out); + + x0 = _mm_srli_si128(q0p0, 8); + x1 = _mm_srli_si128(q1p1, 8); + x2 = _mm_srli_si128(q2p2, 8); + x3 = _mm_srli_si128(q3p3, 8); + x4 = _mm_srli_si128(q4p4, 8); + x5 = _mm_srli_si128(q5p5, 8); + x6 = _mm_srli_si128(q6p6, 8); + + transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2, + &q3); + + _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out); + _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out); + _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out); + _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out); + + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storel_epi64((__m128i *)(s + 3 * p), q3); +} - // Loop filtering - aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); +void aom_lpf_vertical_14_dual_sse2( + unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, + const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, + const uint8_t *_thresh1) { + __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x7, x6, x5, x4, x3, x2, x1, x0; + __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15; + __m128i q0, q1, q2, q3, q7; + __m128i p0p1, p2p3, p4p5, p6p7; + + __m128i blimit = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + __m128i thresh = + _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); - // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p)); + x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p)); + x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p)); + x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p)); + + transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3, + &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15); + + q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8)); + q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8)); + q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8)); + q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8)); + q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8)); + q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8)); + q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); + q7 = _mm_srli_si128(d14d15, 8); + + lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, + &limit, &thresh); + + x0 = _mm_srli_si128(q0p0, 8); + x1 = _mm_srli_si128(q1p1, 8); + x2 = _mm_srli_si128(q2p2, 8); + x3 = _mm_srli_si128(q3p3, 8); + x4 = _mm_srli_si128(q4p4, 8); + x5 = _mm_srli_si128(q5p5, 8); + x6 = _mm_srli_si128(q6p6, 8); + + transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1, + &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3); + + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7); + _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0); + _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1); + _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2); + _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3); } diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h index 027c890dc..c6b6469b4 100644 --- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -14,117 +14,202 @@ #include // SSE2 -#include "./aom_config.h" - -static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], - int out_p, int num_8x8_to_transpose) { - int idx8x8 = 0; - __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7; - do { - uint16_t *in = src[idx8x8]; - uint16_t *out = dst[idx8x8]; - - p0 = - _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 - p1 = - _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 - p2 = - _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 - p3 = - _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 - p4 = - _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 - p5 = - _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 - p6 = - _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 - p7 = - _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 - // 00 10 01 11 02 12 03 13 - x0 = _mm_unpacklo_epi16(p0, p1); - // 20 30 21 31 22 32 23 33 - x1 = _mm_unpacklo_epi16(p2, p3); - // 40 50 41 51 42 52 43 53 - x2 = _mm_unpacklo_epi16(p4, p5); - // 60 70 61 71 62 72 63 73 - x3 = _mm_unpacklo_epi16(p6, p7); - // 00 10 20 30 01 11 21 31 - x4 = _mm_unpacklo_epi32(x0, x1); - // 40 50 60 70 41 51 61 71 - x5 = _mm_unpacklo_epi32(x2, x3); - // 00 10 20 30 40 50 60 70 - x6 = _mm_unpacklo_epi64(x4, x5); - // 01 11 21 31 41 51 61 71 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6); - // 00 10 20 30 40 50 60 70 - _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7); - // 01 11 21 31 41 51 61 71 - - // 02 12 22 32 03 13 23 33 - x4 = _mm_unpackhi_epi32(x0, x1); - // 42 52 62 72 43 53 63 73 - x5 = _mm_unpackhi_epi32(x2, x3); - // 02 12 22 32 42 52 62 72 - x6 = _mm_unpacklo_epi64(x4, x5); - // 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6); - // 02 12 22 32 42 52 62 72 - _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7); - // 03 13 23 33 43 53 63 73 - - // 04 14 05 15 06 16 07 17 - x0 = _mm_unpackhi_epi16(p0, p1); - // 24 34 25 35 26 36 27 37 - x1 = _mm_unpackhi_epi16(p2, p3); - // 44 54 45 55 46 56 47 57 - x2 = _mm_unpackhi_epi16(p4, p5); - // 64 74 65 75 66 76 67 77 - x3 = _mm_unpackhi_epi16(p6, p7); - // 04 14 24 34 05 15 25 35 - x4 = _mm_unpacklo_epi32(x0, x1); - // 44 54 64 74 45 55 65 75 - x5 = _mm_unpacklo_epi32(x2, x3); - // 04 14 24 34 44 54 64 74 - x6 = _mm_unpacklo_epi64(x4, x5); - // 05 15 25 35 45 55 65 75 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6); - // 04 14 24 34 44 54 64 74 - _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7); - // 05 15 25 35 45 55 65 75 - - // 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi32(x0, x1); - // 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi32(x2, x3); - // 06 16 26 36 46 56 66 76 - x6 = _mm_unpacklo_epi64(x4, x5); - // 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi64(x4, x5); - - _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6); - // 06 16 26 36 46 56 66 76 - _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7); - // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); +#include "config/aom_config.h" + +static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5) { + __m128i w0, w1, w2, w3, w4, w5, ww0; + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 + *d1 = _mm_unpackhi_epi64(ww0, + _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + *d2 = _mm_unpacklo_epi64(ww0, + _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx + + w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx + w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx + w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx + + *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 + + ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 + *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 + *d5 = _mm_unpackhi_epi64(ww0, + _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx +} + +static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i zero = _mm_setzero_si128(); + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + + *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx + *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx + *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx + *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx +} + +static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, ww2, ww3; + __m128i zero = _mm_setzero_si128(); + + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + + ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + + *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx + *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx + *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx + *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx } -static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, - uint16_t *out, int out_p) { - uint16_t *src0[1]; - uint16_t *src1[1]; - uint16_t *dest0[1]; - uint16_t *dest1[1]; - src0[0] = in0; - src1[0] = in1; - dest0[0] = out; - dest1[0] = out + 8; - highbd_transpose(src0, in_p, dest0, out_p, 1); - highbd_transpose(src1, in_p, dest1, out_p, 1); +// here in and out pointers (x and d) should be different! we don't store their +// values inside +static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // output + // 00 10 20 30 xx xx xx xx + // 01 11 21 31 xx xx xx xx + // 02 12 22 32 xx xx xx xx + // 03 13 23 33 xx xx xx xx + // 04 14 24 34 xx xx xx xx + // 05 15 25 35 xx xx xx xx + // 06 16 26 36 xx xx xx xx + // 07 17 27 37 xx xx xx xx + highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); + highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); } + +static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + + w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 + w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 + w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 + w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *d4, __m128i *d5, + __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, ww0, ww1; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 + w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 + w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 + + ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +// here in and out pointers (x and d) should be different! we don't store their +// values inside +static INLINE void highbd_transpose8x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); + highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); +} + +// here in and out pointers (x and d arrays) should be different! we don't store +// their values inside +static INLINE void highbd_transpose8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, + __m128i *d7) { + highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, + d5, d6, d7); + highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, + x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, + d4 + 1, d5 + 1, d6 + 1, d7 + 1); +} + #endif // _AOM_DSP_X86_LPF_COMMON_X86_H diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 2536f91d2..1f42eec2f 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -12,8 +12,9 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" @@ -75,11 +76,9 @@ static INLINE unsigned int masked_sad4xh_ssse3( ref_stride, msk, msk_stride, n); \ } -#if CONFIG_EXT_PARTITION MASKSADMXN_SSSE3(128, 128) MASKSADMXN_SSSE3(128, 64) MASKSADMXN_SSSE3(64, 128) -#endif // CONFIG_EXT_PARTITION MASKSADMXN_SSSE3(64, 64) MASKSADMXN_SSSE3(64, 32) MASKSADMXN_SSSE3(32, 64) @@ -93,18 +92,12 @@ MASKSAD8XN_SSSE3(8) MASKSAD8XN_SSSE3(4) MASKSAD4XN_SSSE3(8) MASKSAD4XN_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES MASKSAD4XN_SSSE3(16) MASKSADMXN_SSSE3(16, 4) MASKSAD8XN_SSSE3(32) MASKSADMXN_SSSE3(32, 8) MASKSADMXN_SSSE3(16, 64) MASKSADMXN_SSSE3(64, 16) -#if CONFIG_EXT_PARTITION -MASKSADMXN_SSSE3(32, 128) -MASKSADMXN_SSSE3(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -239,7 +232,6 @@ static INLINE unsigned int masked_sad4xh_ssse3( return (sad + 31) >> 6; } -#if CONFIG_HIGHBITDEPTH // For width a multiple of 8 static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, @@ -277,11 +269,9 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3( ref8, ref_stride, msk, msk_stride, n); \ } -#if CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN_SSSE3(128, 128) HIGHBD_MASKSADMXN_SSSE3(128, 64) HIGHBD_MASKSADMXN_SSSE3(64, 128) -#endif // CONFIG_EXT_PARTITION HIGHBD_MASKSADMXN_SSSE3(64, 64) HIGHBD_MASKSADMXN_SSSE3(64, 32) HIGHBD_MASKSADMXN_SSSE3(32, 64) @@ -295,18 +285,12 @@ HIGHBD_MASKSADMXN_SSSE3(8, 8) HIGHBD_MASKSADMXN_SSSE3(8, 4) HIGHBD_MASKSAD4XN_SSSE3(8) HIGHBD_MASKSAD4XN_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES HIGHBD_MASKSAD4XN_SSSE3(16) HIGHBD_MASKSADMXN_SSSE3(16, 4) HIGHBD_MASKSADMXN_SSSE3(8, 32) HIGHBD_MASKSADMXN_SSSE3(32, 8) HIGHBD_MASKSADMXN_SSSE3(16, 64) HIGHBD_MASKSADMXN_SSSE3(64, 16) -#if CONFIG_EXT_PARTITION -HIGHBD_MASKSADMXN_SSSE3(32, 128) -HIGHBD_MASKSADMXN_SSSE3(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, @@ -424,5 +408,3 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3( int sad = _mm_cvtsi128_si32(res); return (sad + 31) >> 6; } - -#endif diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c index 3ffe132be..d7dbefd7d 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -13,13 +13,15 @@ #include #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" -#include "aom_dsp/blend.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" -#include "aom_ports/mem.h" #include "aom_dsp/aom_filter.h" +#include "aom_dsp/blend.h" +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_ports/mem.h" // For width a multiple of 16 static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, @@ -108,11 +110,9 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \ } -#if CONFIG_EXT_PARTITION MASK_SUBPIX_VAR_SSSE3(128, 128) MASK_SUBPIX_VAR_SSSE3(128, 64) MASK_SUBPIX_VAR_SSSE3(64, 128) -#endif MASK_SUBPIX_VAR_SSSE3(64, 64) MASK_SUBPIX_VAR_SSSE3(64, 32) MASK_SUBPIX_VAR_SSSE3(32, 64) @@ -126,18 +126,12 @@ MASK_SUBPIX_VAR8XH_SSSE3(8) MASK_SUBPIX_VAR8XH_SSSE3(4) MASK_SUBPIX_VAR4XH_SSSE3(8) MASK_SUBPIX_VAR4XH_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES MASK_SUBPIX_VAR4XH_SSSE3(16) MASK_SUBPIX_VAR_SSSE3(16, 4) MASK_SUBPIX_VAR8XH_SSSE3(32) MASK_SUBPIX_VAR_SSSE3(32, 8) MASK_SUBPIX_VAR_SSSE3(64, 16) MASK_SUBPIX_VAR_SSSE3(16, 64) -#if CONFIG_EXT_PARTITION -MASK_SUBPIX_VAR_SSSE3(128, 32) -MASK_SUBPIX_VAR_SSSE3(32, 128) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES static INLINE __m128i filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -523,7 +517,6 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } -#if CONFIG_HIGHBITDEPTH // For width a multiple of 8 static void highbd_bilinear_filter(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, @@ -695,11 +688,9 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128) HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128) -#endif HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64) @@ -713,18 +704,12 @@ HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8) HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4) -#if CONFIG_EXT_PARTITION_TYPES HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4) HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32) HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8) HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64) HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16) -#if CONFIG_EXT_PARTITION -HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 128) -HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 32) -#endif -#endif static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b, const __m128i filter) { @@ -1040,4 +1025,40 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } -#endif +void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + assert(height % 2 == 0); + int i = 0; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); + comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1, + mask + mask_stride, comp_pred + width); + comp_pred += (width << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } else { // width == 32 + assert(width == 32); + do { + comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred); + comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16); + comp_pred += (width); + src0 += (stride0); + src1 += (stride1); + mask += (mask_stride); + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h new file mode 100644 index 000000000..dc41a8342 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H +#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" + +static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0, + const uint8_t *src1, + const uint8_t *mask, uint8_t *dst) { + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + + const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0)); + const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1)); + const __m128i aA = _mm_load_si128((const __m128i *)(mask)); + + const __m128i maA = _mm_sub_epi8(alpha_max, aA); + + const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1); + const __m128i aaAL = _mm_unpacklo_epi8(aA, maA); + const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1); + const __m128i aaAH = _mm_unpackhi_epi8(aA, maA); + + const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL); + const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH); + + const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset); + const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset); + _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH)); +} + +static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, + const uint8_t *src0, int stride0, + const uint8_t *src1, int stride1, + const uint8_t *mask, + int mask_stride) { + int i = 0; + const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i round_offset = + _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { + // odd line A + const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0)); + const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1)); + const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask)); + // even line B + const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0)); + const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1)); + const __m128i a = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride))); + + const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1); + const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1); + + const __m128i ma = _mm_sub_epi8(alpha_max, a); + const __m128i aaA = _mm_unpacklo_epi8(a, ma); + const __m128i aaB = _mm_unpackhi_epi8(a, ma); + + const __m128i blendA = _mm_maddubs_epi16(ssA, aaA); + const __m128i blendB = _mm_maddubs_epi16(ssB, aaB); + const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset); + const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset); + const __m128i round = _mm_packus_epi16(roundA, roundB); + // comp_pred's stride == width == 8 + _mm_store_si128((__m128i *)(comp_pred), round); + comp_pred += (8 << 1); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); +} + +#endif diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h new file mode 100644 index 000000000..8b69606dd --- /dev/null +++ b/third_party/aom/aom_dsp/x86/mem_sse2.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_MEM_SSE2_H_ +#define AOM_DSP_X86_MEM_SSE2_H_ + +#include // SSE2 + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + +static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, + const int byte_stride) { + return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride), + *(const int32_t *)((int8_t *)src + 1 * byte_stride), + *(const int32_t *)((int8_t *)src + 2 * byte_stride), + *(const int32_t *)((int8_t *)src + 3 * byte_stride)); +} + +static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, + const int byte_stride) { + __m128i dst; + dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); + dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); + return dst; +} + +#endif // AOM_DSP_X86_MEM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h index 73589a32a..a3535f985 100644 --- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -14,7 +14,7 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) { v_d = _mm_hadd_epi32(v_d, v_d); diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c index 52dd508ec..0338a8c77 100644 --- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_ports/mem.h" #include "aom/aom_integer.h" @@ -24,9 +25,11 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// -static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - const int height) { +static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { const int pre_step = pre_stride - 4; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); @@ -59,11 +62,9 @@ static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, return xx_hsum_epi32_si32(v_sad_d); } -static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, const int width, - const int height) { +static AOM_FORCE_INLINE unsigned int obmc_sad_w8n( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); @@ -119,11 +120,9 @@ static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre, } \ } -#if CONFIG_EXT_PARTITION OBMCSADWXH(128, 128) OBMCSADWXH(128, 64) OBMCSADWXH(64, 128) -#endif // CONFIG_EXT_PARTITION OBMCSADWXH(64, 64) OBMCSADWXH(64, 32) OBMCSADWXH(32, 64) @@ -137,25 +136,22 @@ OBMCSADWXH(8, 8) OBMCSADWXH(8, 4) OBMCSADWXH(4, 8) OBMCSADWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES OBMCSADWXH(4, 16) OBMCSADWXH(16, 4) OBMCSADWXH(8, 32) OBMCSADWXH(32, 8) OBMCSADWXH(16, 64) OBMCSADWXH(64, 16) -#endif //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// -#if CONFIG_HIGHBITDEPTH -static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int height) { +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - 4; int n = 0; @@ -189,11 +185,9 @@ static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8, return xx_hsum_epi32_si32(v_sad_d); } -static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, - const int pre_stride, - const int32_t *wsrc, - const int32_t *mask, - const int width, const int height) { +static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; @@ -250,11 +244,9 @@ static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, } \ } -#if CONFIG_EXT_PARTITION HBD_OBMCSADWXH(128, 128) HBD_OBMCSADWXH(128, 64) HBD_OBMCSADWXH(64, 128) -#endif // CONFIG_EXT_PARTITION HBD_OBMCSADWXH(64, 64) HBD_OBMCSADWXH(64, 32) HBD_OBMCSADWXH(32, 64) @@ -268,12 +260,9 @@ HBD_OBMCSADWXH(8, 8) HBD_OBMCSADWXH(8, 4) HBD_OBMCSADWXH(4, 8) HBD_OBMCSADWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES HBD_OBMCSADWXH(4, 16) HBD_OBMCSADWXH(16, 4) HBD_OBMCSADWXH(8, 32) HBD_OBMCSADWXH(32, 8) HBD_OBMCSADWXH(16, 64) HBD_OBMCSADWXH(64, 16) -#endif -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 392616af3..571aa770b 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom_ports/mem.h" #include "aom/aom_integer.h" @@ -128,11 +129,9 @@ static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } -#if CONFIG_EXT_PARTITION OBMCVARWXH(128, 128) OBMCVARWXH(128, 64) OBMCVARWXH(64, 128) -#endif // CONFIG_EXT_PARTITION OBMCVARWXH(64, 64) OBMCVARWXH(64, 32) OBMCVARWXH(32, 64) @@ -146,24 +145,17 @@ OBMCVARWXH(8, 8) OBMCVARWXH(8, 4) OBMCVARWXH(4, 8) OBMCVARWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES OBMCVARWXH(4, 16) OBMCVARWXH(16, 4) OBMCVARWXH(8, 32) OBMCVARWXH(32, 8) OBMCVARWXH(16, 64) OBMCVARWXH(64, 16) -#if CONFIG_EXT_PARTITION -OBMCVARWXH(32, 128) -OBMCVARWXH(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// -#if CONFIG_HIGHBITDEPTH static INLINE void hbd_obmc_variance_w4( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { @@ -278,8 +270,19 @@ static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride, uint64_t sse64 = 0; if (w == 4) { hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); - } else { + } else if (w < 128 || h < 128) { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + assert(w == 128 && h == 128); + + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + 64); + pre8 += 64 * pre_stride; + wsrc += 64 * w; + mask += 64 * w; + h -= 64; + } while (h > 0); } *sum = (int)ROUND_POWER_OF_TWO(sum64, 2); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4); @@ -291,28 +294,23 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, unsigned int *sse, int *sum) { int64_t sum64 = 0; uint64_t sse64 = 0; - if (w == 128) { - do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128, - 32); - pre8 += 32 * pre_stride; - wsrc += 32 * 128; - mask += 32 * 128; - h -= 32; - } while (h > 0); - } else if (w == 64 && h >= 128) { - do { - hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64, - 64); - pre8 += 64 * pre_stride; - wsrc += 64 * 64; - mask += 64 * 64; - h -= 64; - } while (h > 0); - } else if (w == 4) { + int max_pel_allowed_per_ovf = 512; + if (w == 4) { hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h); - } else { + } else if (w * h <= max_pel_allowed_per_ovf) { hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h); + } else { + int h_per_ovf = max_pel_allowed_per_ovf / w; + + assert(max_pel_allowed_per_ovf % w == 0); + do { + hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, + h_per_ovf); + pre8 += h_per_ovf * pre_stride; + wsrc += h_per_ovf * w; + mask += h_per_ovf * w; + h -= h_per_ovf; + } while (h > 0); } *sum = (int)ROUND_POWER_OF_TWO(sum64, 4); *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8); @@ -347,11 +345,9 @@ static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride, return (var >= 0) ? (uint32_t)var : 0; \ } -#if CONFIG_EXT_PARTITION HBD_OBMCVARWXH(128, 128) HBD_OBMCVARWXH(128, 64) HBD_OBMCVARWXH(64, 128) -#endif // CONFIG_EXT_PARTITION HBD_OBMCVARWXH(64, 64) HBD_OBMCVARWXH(64, 32) HBD_OBMCVARWXH(32, 64) @@ -365,16 +361,9 @@ HBD_OBMCVARWXH(8, 8) HBD_OBMCVARWXH(8, 4) HBD_OBMCVARWXH(4, 8) HBD_OBMCVARWXH(4, 4) -#if CONFIG_EXT_PARTITION_TYPES HBD_OBMCVARWXH(4, 16) HBD_OBMCVARWXH(16, 4) HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) HBD_OBMCVARWXH(16, 64) HBD_OBMCVARWXH(64, 16) -#if CONFIG_EXT_PARTITION -HBD_OBMCVARWXH(32, 128) -HBD_OBMCVARWXH(128, 32) -#endif // CONFIG_EXT_PARTITION -#endif // CONFIG_EXT_PARTITION_TYPES -#endif // CONFIG_HIGHBITDEPTH diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm index 954a95b98..e6b40262d 100644 --- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm @@ -44,16 +44,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova m0, [zbinq] ; m0 = zbin ; Get DC and first 15 AC coeffs - in this special case, that is all. -%if CONFIG_HIGHBITDEPTH ; coeff stored as 32bit numbers but we process them as 16 bit numbers mova m9, [coeffq] packssdw m9, [coeffq+16] ; m9 = c[i] mova m10, [coeffq+32] packssdw m10, [coeffq+48] ; m10 = c[i] -%else - mova m9, [coeffq] ; m9 = c[i] - mova m10, [coeffq+16] ; m10 = c[i] -%endif mov r0, eobmp ; Output pointer mov r1, qcoeffmp ; Output pointer @@ -76,15 +71,10 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ ptest m14, m14 jnz .single_nonzero -%if CONFIG_HIGHBITDEPTH mova [r1 ], ymm5 mova [r1+32], ymm5 mova [r2 ], ymm5 mova [r2+32], ymm5 -%else - mova [r1], ymm5 - mova [r2], ymm5 -%endif mov [r0], word 0 vzeroupper @@ -124,7 +114,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pand m8, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -136,16 +125,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmovsxwd m11, m13 mova [qcoeffq+32], m11 mova [qcoeffq+48], m6 -%else - mova [qcoeffq ], m8 - mova [qcoeffq+16], m13 -%endif pmullw m8, m3 ; dqc[i] = qc[i] * q punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q -%if CONFIG_HIGHBITDEPTH ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -157,10 +141,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pmovsxwd m11, m13 mova [dqcoeffq+32], m11 mova [dqcoeffq+48], m6 -%else - mova [dqcoeffq ], m8 - mova [dqcoeffq+16], m13 -%endif mova m6, [iscanq] ; m6 = scan[i] mova m11, [iscanq+16] ; m11 = scan[i] @@ -229,29 +209,20 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_HIGHBITDEPTH + lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif + lea iscanq, [ iscanq+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs -%if CONFIG_HIGHBITDEPTH ; coeff stored as 32bit numbers & require 16bit numbers mova m9, [coeffq+ncoeffq*4+ 0] packssdw m9, [coeffq+ncoeffq*4+16] mova m10, [coeffq+ncoeffq*4+32] packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -264,16 +235,10 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ ptest m14, m14 jnz .first_nonzero -%if CONFIG_HIGHBITDEPTH mova [qcoeffq+ncoeffq*4 ], ymm5 mova [qcoeffq+ncoeffq*4+32], ymm5 mova [dqcoeffq+ncoeffq*4 ], ymm5 mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2], ymm5 - mova [dqcoeffq+ncoeffq*2], ymm5 -%endif - add ncoeffq, mmsize punpckhqdq m1, m1 @@ -302,7 +267,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pand m8, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -314,10 +278,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif %ifidn %1, b_32x32 pabsw m8, m8 @@ -333,7 +293,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ psignw m13, m10 %endif -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 @@ -345,10 +304,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 @@ -363,16 +318,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ .ac_only_loop: -%if CONFIG_HIGHBITDEPTH ; pack coeff from 32bit to 16bit array mova m9, [coeffq+ncoeffq*4+ 0] packssdw m9, [coeffq+ncoeffq*4+16] mova m10, [coeffq+ncoeffq*4+32] packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) @@ -385,15 +335,11 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ ptest m14, m14 jnz .rest_nonzero -%if CONFIG_HIGHBITDEPTH mova [qcoeffq+ncoeffq*4+ 0], ymm5 mova [qcoeffq+ncoeffq*4+32], ymm5 mova [dqcoeffq+ncoeffq*4+ 0], ymm5 mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2+ 0], ymm5 - mova [dqcoeffq+ncoeffq*2+ 0], ymm5 -%endif + add ncoeffq, mmsize jnz .ac_only_loop @@ -424,7 +370,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pand m14, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m14 punpckhwd m6, m14, m6 @@ -436,10 +381,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif %ifidn %1, b_32x32 pabsw m14, m14 @@ -454,7 +395,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ psignw m13, m10 %endif -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pcmpgtw m6, m5, m14 punpckhwd m6, m14, m6 @@ -466,10 +406,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ pmovsxwd m11, m13 mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 @@ -510,27 +446,16 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_HIGHBITDEPTH lea dqcoeffq, [dqcoeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - neg ncoeffq pxor m7, m7 .blank_loop: -%if CONFIG_HIGHBITDEPTH mova [dqcoeffq+ncoeffq*4+ 0], ymm7 mova [dqcoeffq+ncoeffq*4+32], ymm7 mova [qcoeffq+ncoeffq*4+ 0], ymm7 mova [qcoeffq+ncoeffq*4+32], ymm7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], ymm7 - mova [qcoeffq+ncoeffq*2+ 0], ymm7 -%endif add ncoeffq, mmsize jl .blank_loop @@ -543,5 +468,3 @@ DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob INIT_XMM avx QUANTIZE_FN b, 7 QUANTIZE_FN b_32x32, 7 - -END diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 0e7f679d0..46b9c7d29 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -12,7 +12,8 @@ #include #include -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm index 36b4dddbd..e2c1ebb71 100644 --- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -45,7 +45,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] + psubw m0, [GLOBAL(pw_1)] mov r2, shiftmp mov r3, qcoeffmp mova m4, [r2] ; m4 = shift @@ -56,29 +56,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_HIGHBITDEPTH lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif lea iscanq, [ iscanq+ncoeffq*2] neg ncoeffq ; get DC and first 15 AC coeffs -%if CONFIG_HIGHBITDEPTH ; coeff stored as 32bit numbers & require 16bit numbers mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin @@ -99,7 +88,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign pand m8, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m8 mova m6, m8 @@ -117,10 +106,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif + %ifidn %1, b_32x32 pabsw m8, m8 pabsw m13, m13 @@ -134,7 +120,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m8, m9 psignw m13, m10 %endif -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m8 mova m6, m8 @@ -152,10 +137,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif pcmpeqw m8, m5 ; m8 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] @@ -169,16 +150,12 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ jz .accumulate_eob .ac_only_loop: -%if CONFIG_HIGHBITDEPTH ; pack coeff from 32bit to 16bit array mova m9, [ coeffq+ncoeffq*4+ 0] packssdw m9, [ coeffq+ncoeffq*4+16] mova m10, [ coeffq+ncoeffq*4+32] packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif + pabsw m6, m9 ; m6 = abs(m9) pabsw m11, m10 ; m11 = abs(m10) pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin @@ -201,7 +178,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m13, m10 ; m13 = reinsert sign pand m14, m7 pand m13, m12 -%if CONFIG_HIGHBITDEPTH ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pxor m11, m11 mova m11, m14 @@ -220,10 +196,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [qcoeffq+ncoeffq*4+32], m11 mova [qcoeffq+ncoeffq*4+48], m6 pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif + %ifidn %1, b_32x32 pabsw m14, m14 pabsw m13, m13 @@ -236,7 +209,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psignw m14, m9 psignw m13, m10 %endif -%if CONFIG_HIGHBITDEPTH + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff mova m11, m14 mova m6, m14 @@ -254,10 +227,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [dqcoeffq+ncoeffq*4+32], m11 mova [dqcoeffq+ncoeffq*4+48], m6 pxor m5, m5 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif + pcmpeqw m14, m5 ; m14 = c[i] == 0 pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] @@ -274,7 +244,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %ifidn %1, b_32x32 jmp .accumulate_eob .skip_iter: -%if CONFIG_HIGHBITDEPTH mova [qcoeffq+ncoeffq*4+ 0], m5 mova [qcoeffq+ncoeffq*4+16], m5 mova [qcoeffq+ncoeffq*4+32], m5 @@ -283,12 +252,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [dqcoeffq+ncoeffq*4+16], m5 mova [dqcoeffq+ncoeffq*4+32], m5 mova [dqcoeffq+ncoeffq*4+48], m5 -%else - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 -%endif add ncoeffq, mmsize jl .ac_only_loop %endif @@ -313,17 +276,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mov r2, qcoeffmp mov r3, eobmp DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_HIGHBITDEPTH lea dqcoeffq, [dqcoeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif neg ncoeffq pxor m7, m7 .blank_loop: -%if CONFIG_HIGHBITDEPTH mova [dqcoeffq+ncoeffq*4+ 0], m7 mova [dqcoeffq+ncoeffq*4+16], m7 mova [dqcoeffq+ncoeffq*4+32], m7 @@ -332,12 +289,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ mova [qcoeffq+ncoeffq*4+16], m7 mova [qcoeffq+ncoeffq*4+32], m7 mova [qcoeffq+ncoeffq*4+48], m7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 -%endif add ncoeffq, mmsize jl .blank_loop mov word [eobq], 0 diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c index e60f518b4..f662b62b1 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c @@ -9,7 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include // AVX2 -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom/aom_integer.h" void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm index 2c67f450f..55a856985 100644 --- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm @@ -233,11 +233,9 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ %endmacro INIT_XMM sse2 -%if CONFIG_EXT_PARTITION SADNXN4D 128, 128 SADNXN4D 128, 64 SADNXN4D 64, 128 -%endif SADNXN4D 64, 64 SADNXN4D 64, 32 SADNXN4D 32, 64 @@ -251,11 +249,9 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 -%if CONFIG_EXT_PARTITION_TYPES SADNXN4D 4, 16 SADNXN4D 16, 4 SADNXN4D 8, 32 SADNXN4D 32, 8 SADNXN4D 16, 64 SADNXN4D 64, 16 -%endif diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c index efba61289..a50dba64a 100644 --- a/third_party/aom/aom_dsp/x86/sad_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad_avx2.c @@ -9,7 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" #define FSAD64_H(h) \ diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c index e8dd87a26..b506d4663 100644 --- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c @@ -11,10 +11,11 @@ #include -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms_avx2.h" #include "aom_ports/mem.h" // SAD @@ -360,7 +361,6 @@ unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, return sum; } -#if CONFIG_EXT_PARTITION static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, const uint16_t *sec_ptr, __m256i *sad_acc) { __m256i s[8], r[8]; @@ -471,7 +471,6 @@ unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); return sum; } -#endif // CONFIG_EXT_PARTITION // If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, @@ -649,7 +648,6 @@ unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride, return sum; } -#if CONFIG_EXT_PARTITION unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -697,19 +695,13 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, second_pred); return sum; } -#endif // CONFIG_EXT_PARTITION // SAD 4D // Combine 4 __m256i vectors to uint32_t result[4] static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; -#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 - const __m256i mask = _mm256_setr_epi32(UINT32_MAX, 0, UINT32_MAX, 0, - UINT32_MAX, 0, UINT32_MAX, 0); -#else - const __m256i mask = _mm256_set1_epi64x(UINT32_MAX); -#endif + const __m256i mask = yy_set1_64_from_32i(UINT32_MAX); __m128i sad; // 8 32-bit summation @@ -967,7 +959,6 @@ void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, sad_array[3] = first_half[3] + second_half[3]; } -#if CONFIG_EXT_PARTITION void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, const uint8_t *const ref_array[], int ref_stride, uint32_t *sad_array) { @@ -1045,4 +1036,3 @@ void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, sad_array[2] = first_half[2] + second_half[2]; sad_array[3] = first_half[3] + second_half[3]; } -#endif // CONFIG_EXT_PARTITION diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c index 4419c65b2..c6fd62c9e 100644 --- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c +++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c @@ -10,7 +10,8 @@ */ #include -#include "./aom_dsp_rtcd.h" + +#include "config/aom_dsp_rtcd.h" static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm index b4cc6abf1..3251b7655 100644 --- a/third_party/aom/aom_dsp/x86/sad_sse2.asm +++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm @@ -47,7 +47,6 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \ %endif ; %3 == 7 %endmacro -%if CONFIG_EXT_PARTITION ; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD128XN 1-2 0 @@ -114,7 +113,6 @@ SAD128XN 128 ; sad128x128_sse2 SAD128XN 128, 1 ; sad128x128_avg_sse2 SAD128XN 64 ; sad128x64_sse2 SAD128XN 64, 1 ; sad128x64_avg_sse2 -%endif ; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride, @@ -155,18 +153,14 @@ SAD128XN 64, 1 ; sad128x64_avg_sse2 %endmacro INIT_XMM sse2 -%if CONFIG_EXT_PARTITION SAD64XN 128 ; sad64x128_sse2 SAD64XN 128, 1 ; sad64x128_avg_sse2 -%endif SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD64XN 16 ; sad64x16_sse2 SAD64XN 16, 1 ; sad64x16_avg_sse2 -%endif ; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -212,10 +206,8 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD32XN 8 ; sad_32x8_sse2 SAD32XN 8, 1 ; sad_32x8_avg_sse2 -%endif ; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -262,12 +254,10 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD16XN 4 ; sad_16x4_sse2 SAD16XN 4, 1 ; sad_16x4_avg_sse2 SAD16XN 64 ; sad_16x64_sse2 SAD16XN 64, 1 ; sad_16x64_avg_sse2 -%endif ; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -312,10 +302,8 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 -%if CONFIG_EXT_PARTITION_TYPES SAD8XN 32 ; sad_8x32_sse2 SAD8XN 32, 1 ; sad_8x32_avg_sse2 -%endif ; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); @@ -361,7 +349,5 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse -%if CONFIG_EXT_PARTITION_TYPES SAD4XN 16 ; sad_4x16_sse2 SAD4XN 16, 1 ; sad_4x16_avg_sse2 -%endif diff --git a/third_party/aom/aom_dsp/x86/sad_sse3.asm b/third_party/aom/aom_dsp/x86/sad_sse3.asm deleted file mode 100644 index f6c27c855..000000000 --- a/third_party/aom/aom_dsp/x86/sad_sse3.asm +++ /dev/null @@ -1,377 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBAOM_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBAOM_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -;void int aom_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x16x3_sse3) PRIVATE -sym(aom_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x8x3_sse3) PRIVATE -sym(aom_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad8x16x3_sse3) PRIVATE -sym(aom_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad8x8x3_sse3) PRIVATE -sym(aom_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int aom_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad4x4x3_sse3) PRIVATE -sym(aom_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR [src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 diff --git a/third_party/aom/aom_dsp/x86/sad_sse4.asm b/third_party/aom/aom_dsp/x86/sad_sse4.asm deleted file mode 100644 index 5e9c75845..000000000 --- a/third_party/aom/aom_dsp/x86/sad_sse4.asm +++ /dev/null @@ -1,362 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - -%macro WRITE_AS_INTS 0 - mov rdi, arg(4) ;Results - pxor xmm0, xmm0 - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm0 - punpckhwd xmm2, xmm0 - - movdqa [rdi], xmm1 - movdqa [rdi + 16], xmm2 -%endmacro - -;void aom_sad16x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -global sym(aom_sad16x16x8_sse4_1) PRIVATE -sym(aom_sad16x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad16x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad16x8x8_sse4_1) PRIVATE -sym(aom_sad16x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad8x8x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad8x8x8_sse4_1) PRIVATE -sym(aom_sad8x8x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad8x16x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad8x16x8_sse4_1) PRIVATE -sym(aom_sad8x16x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_sad4x4x8_sse4_1( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(aom_sad4x4x8_sse4_1) PRIVATE -sym(aom_sad4x4x8_sse4_1): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/third_party/aom/aom_dsp/x86/sad_ssse3.asm b/third_party/aom/aom_dsp/x86/sad_ssse3.asm deleted file mode 100644 index 96b64b040..000000000 --- a/third_party/aom/aom_dsp/x86/sad_ssse3.asm +++ /dev/null @@ -1,373 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -;void int aom_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x16x3_ssse3) PRIVATE -sym(aom_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .aom_sad16x16x3_ssse3_skiptable -.aom_sad16x16x3_ssse3_jumptable: - dd .aom_sad16x16x3_ssse3_aligned_by_0 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_1 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_2 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_3 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_4 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_5 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_6 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_7 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_8 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_9 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_10 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_11 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_12 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_13 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_14 - .aom_sad16x16x3_ssse3_do_jump - dd .aom_sad16x16x3_ssse3_aligned_by_15 - .aom_sad16x16x3_ssse3_do_jump -.aom_sad16x16x3_ssse3_skiptable: - - call .aom_sad16x16x3_ssse3_do_jump -.aom_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .aom_sad16x16x3_ssse3_jumptable - .aom_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of aom_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .aom_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .aom_sad16x16x3_ssse3 - -.aom_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.aom_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int aom_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(aom_sad16x8x3_ssse3) PRIVATE -sym(aom_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .aom_sad16x8x3_ssse3_skiptable -.aom_sad16x8x3_ssse3_jumptable: - dd .aom_sad16x8x3_ssse3_aligned_by_0 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_1 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_2 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_3 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_4 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_5 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_6 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_7 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_8 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_9 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_10 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_11 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_12 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_13 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_14 - .aom_sad16x8x3_ssse3_do_jump - dd .aom_sad16x8x3_ssse3_aligned_by_15 - .aom_sad16x8x3_ssse3_do_jump -.aom_sad16x8x3_ssse3_skiptable: - - call .aom_sad16x8x3_ssse3_do_jump -.aom_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .aom_sad16x8x3_ssse3_jumptable - .aom_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of aom_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .aom_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .aom_sad16x8x3_ssse3 - -.aom_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.aom_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm index aa70106c8..6d9b5a12f 100644 --- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm @@ -47,6 +47,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm index d3feb7ec0..45bf6ec3c 100644 --- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm +++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm @@ -117,27 +117,26 @@ SECTION .text ; 11, not 13, if the registers are ordered correctly. May make a minor speed ; difference on Win64 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -155,9 +154,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse, g_bilin_filter, g_pw_8 %define block_height heightd ;Store bilin_filter and pw_8 location in stack @@ -176,25 +175,18 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %define block_height heightd %endif - %define bilin_filter bilin_filter_m %endif %endif @@ -374,8 +366,8 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -383,7 +375,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -400,7 +392,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -697,8 +689,8 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -706,7 +698,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -723,7 +715,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -855,8 +847,8 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -864,7 +856,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -881,7 +873,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -997,8 +989,8 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -1006,7 +998,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -1023,7 +1015,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1195,8 +1187,8 @@ SECTION .text STORE_AND_RET %1 .x_nonhalf_y_nonhalf: -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift @@ -1209,7 +1201,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -1237,7 +1229,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm index 7bd5b23ad..1a75a234f 100644 --- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm +++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm @@ -34,10 +34,8 @@ cglobal subtract_block, 7, 7, 8, \ je .case_16 cmp colsd, 32 je .case_32 -%if CONFIG_EXT_PARTITION cmp colsd, 64 je .case_64 -%endif %macro loop16 6 mova m0, [srcq+%1] @@ -62,7 +60,6 @@ cglobal subtract_block, 7, 7, 8, \ mova [diffq+mmsize*1+%6], m1 %endmacro -%if CONFIG_EXT_PARTITION mov pred_str, pred_stridemp .loop_128: loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize @@ -77,7 +74,6 @@ cglobal subtract_block, 7, 7, 8, \ RET .case_64: -%endif mov pred_str, pred_stridemp .loop_64: loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c index 6be99fbca..a79f22d79 100644 --- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c @@ -14,35 +14,62 @@ #include #include "aom_dsp/x86/synonyms.h" +#include "config/aom_dsp_rtcd.h" -#include "./aom_dsp_rtcd.h" +static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { + const __m128d ad = _mm_castsi128_pd(a); + return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b)); +} + +static INLINE uint64_t xx_cvtsi128_si64(__m128i a) { +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(a); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, a); + return tmp; + } +#endif +} + +static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { + const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride); + const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride); + const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride); + const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride); + const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w); + const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w); + + return _mm_add_epi32(v_sq_01_d, v_sq_23_d); +} static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { - const __m128i v_val_0_w = - _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); - const __m128i v_val_1_w = - _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); - const __m128i v_val_2_w = - _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); - const __m128i v_val_3_w = - _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - const __m128i v_sum_d = + const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); + __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - + v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8)); return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } +static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height) { + int r = 0; + __m128i v_acc_q = _mm_setzero_si128(); + do { + const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride); + v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d); + src += stride << 2; + r += 4; + } while (r < height); + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); + __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32), + _mm_and_si128(v_acc_q, v_zext_mask_q)); + v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8)); + return xx_cvtsi128_si64(v_acc_64); +} + #ifdef __GNUC__ // This prevents GCC/Clang from inlining this function into // aom_sum_squares_2d_i16_sse2, which in turn saves some stack @@ -52,72 +79,45 @@ __attribute__((noinline)) static uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height) { - int r, c; + int r = 0; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); __m128i v_acc_q = _mm_setzero_si128(); - for (r = 0; r < height; r += 8) { + do { __m128i v_acc_d = _mm_setzero_si128(); - - for (c = 0; c < width; c += 8) { + int c = 0; + do { const int16_t *b = src + c; - const __m128i v_val_0_w = - _mm_load_si128((const __m128i *)(b + 0 * stride)); - const __m128i v_val_1_w = - _mm_load_si128((const __m128i *)(b + 1 * stride)); - const __m128i v_val_2_w = - _mm_load_si128((const __m128i *)(b + 2 * stride)); - const __m128i v_val_3_w = - _mm_load_si128((const __m128i *)(b + 3 * stride)); - const __m128i v_val_4_w = - _mm_load_si128((const __m128i *)(b + 4 * stride)); - const __m128i v_val_5_w = - _mm_load_si128((const __m128i *)(b + 5 * stride)); - const __m128i v_val_6_w = - _mm_load_si128((const __m128i *)(b + 6 * stride)); - const __m128i v_val_7_w = - _mm_load_si128((const __m128i *)(b + 7 * stride)); + const __m128i v_val_0_w = xx_load_128(b + 0 * stride); + const __m128i v_val_1_w = xx_load_128(b + 1 * stride); + const __m128i v_val_2_w = xx_load_128(b + 2 * stride); + const __m128i v_val_3_w = xx_load_128(b + 3 * stride); const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); - } + c += 8; + } while (c < width); v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); - src += 8 * stride; - } + src += 4 * stride; + r += 4; + } while (r < height); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); - -#if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc_q); -#else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc_q); - return tmp; - } -#endif + return xx_cvtsi128_si64(v_acc_q); } uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, @@ -127,7 +127,9 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, // are with size == 4, so it is also the common case. if (LIKELY(width == 4 && height == 4)) { return aom_sum_squares_2d_i16_4x4_sse2(src, stride); - } else if (LIKELY(width % 8 == 0 && height % 8 == 0)) { + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) { // Generic case return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); } else { @@ -140,7 +142,7 @@ uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width, ////////////////////////////////////////////////////////////////////////////// static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff); __m128i v_acc0_q = _mm_setzero_si128(); __m128i v_acc1_q = _mm_setzero_si128(); @@ -185,16 +187,7 @@ static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) { v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); - -#if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc0_q); -#else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc0_q); - return tmp; - } -#endif + return xx_cvtsi128_si64(v_acc0_q); } uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index cd049a454..d9a53fcc5 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -14,7 +14,8 @@ #include -#include "./aom_config.h" +#include "config/aom_config.h" + #include "aom/aom_integer.h" /** @@ -58,6 +59,28 @@ static INLINE void xx_storeu_128(void *const a, const __m128i v) { _mm_storeu_si128((__m128i *)a, v); } +// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set_epi64x() +// acting on 32-bit integers. +static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, e1, 0, e0); +#else + return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0); +#endif +} + +// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm_set1_epi64x() +// acting on a 32-bit integer. +static INLINE __m128i xx_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && _MSC_VER < 1900 + return _mm_set_epi32(0, a, 0, a); +#else + return _mm_set1_epi64x((uint32_t)a); +#endif +} + static INLINE __m128i xx_round_epu16(__m128i v_val_w) { return _mm_avg_epu16(v_val_w, _mm_setzero_si128()); } @@ -89,4 +112,12 @@ static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { return _mm_srai_epi32(v_tmp_d, bits); } +static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); + const __m128i v_tmp_d = + _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi16(v_tmp_d, bits); +} + #endif // AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h new file mode 100644 index 000000000..39f371fc9 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_ +#define AOM_DSP_X86_SYNONYMS_AVX2_H_ + +#include + +#include "config/aom_config.h" + +#include "aom/aom_integer.h" + +/** + * Various reusable shorthands for x86 SIMD intrinsics. + * + * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers. + * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers. + */ + +// Loads and stores to do away with the tedium of casting the address +// to the right type. +static INLINE __m256i yy_load_256(const void *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE __m256i yy_loadu_256(const void *a) { + return _mm256_loadu_si256((const __m256i *)a); +} + +static INLINE void yy_store_256(void *const a, const __m256i v) { + _mm256_store_si256((__m256i *)a, v); +} + +static INLINE void yy_storeu_256(void *const a, const __m256i v) { + _mm256_storeu_si256((__m256i *)a, v); +} + +// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio +// compilers. The following function is equivalent to _mm256_set1_epi64x() +// acting on a 32-bit integer. +static INLINE __m256i yy_set1_64_from_32i(int32_t a) { +#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a); +#else + return _mm256_set1_epi64x((uint32_t)a); +#endif +} + +// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We +// therefore define an equivalent function using a different intrinsic. +// ([ hi ], [ lo ]) -> [ hi ][ lo ] +static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); +} + +#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h new file mode 100644 index 000000000..f88a1527d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#define AOM_DSP_X86_TRANSPOSE_SSE2_H_ + +#include // SSE2 + +#include "config/aom_config.h" + +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_srli_si128(out[0], 8); + out[2] = _mm_unpackhi_epi32(a0, a1); + out[3] = _mm_srli_si128(out[2], 8); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b2: 04 14 24 34 05 15 25 35 + // b4: 02 12 22 32 03 13 23 33 + // b6: 06 16 26 36 07 17 27 37 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 XX XX XX XX + // out[1]: 01 11 21 31 XX XX XX XX + // out[2]: 02 12 22 32 XX XX XX XX + // out[3]: 03 13 23 33 XX XX XX XX + // out[4]: 04 14 24 34 XX XX XX XX + // out[5]: 05 15 25 35 XX XX XX XX + // out[6]: 06 16 26 36 XX XX XX XX + // out[7]: 07 17 27 37 XX XX XX XX + const __m128i zeros = _mm_setzero_si128(); + out[0] = _mm_unpacklo_epi64(b0, zeros); + out[1] = _mm_unpackhi_epi64(b0, zeros); + out[2] = _mm_unpacklo_epi64(b4, zeros); + out[3] = _mm_unpackhi_epi64(b4, zeros); + out[4] = _mm_unpacklo_epi64(b2, zeros); + out[5] = _mm_unpackhi_epi64(b2, zeros); + out[6] = _mm_unpacklo_epi64(b6, zeros); + out[7] = _mm_unpackhi_epi64(b6, zeros); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); +} + +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +#endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h deleted file mode 100644 index 1a8fed710..000000000 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H -#define AOM_DSP_X86_TXFM_COMMON_AVX2_H - -#include - -#include "aom_dsp/txfm_common.h" -#include "aom_dsp/x86/common_avx2.h" - -#define pair256_set_epi16(a, b) \ - _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) - -#define pair256_set_epi32(a, b) \ - _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \ - (int)(b), (int)(a)) - -static INLINE void mm256_reverse_epi16(__m256i *u) { - const __m256i control = _mm256_set_epi16( - 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100, - 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E); - __m256i v = _mm256_shuffle_epi8(*u, control); - *u = _mm256_permute2x128_si256(v, v, 1); -} - -static INLINE __m256i butter_fly(const __m256i *a0, const __m256i *a1, - const __m256i *cospi) { - const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i y0 = _mm256_madd_epi16(*a0, *cospi); - __m256i y1 = _mm256_madd_epi16(*a1, *cospi); - - y0 = _mm256_add_epi32(y0, dct_rounding); - y1 = _mm256_add_epi32(y1, dct_rounding); - y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS); - y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS); - - return _mm256_packs_epi32(y0, y1); -} - -static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) { - const __m256i zero = _mm256_setzero_si256(); - const __m256i sqrt2_epi16 = _mm256_set1_epi16(c); - const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING); - __m256i u0, u1; - int i = 0; - - while (i < 16) { - in[i] = _mm256_slli_epi16(in[i], 1); - - u0 = _mm256_unpacklo_epi16(zero, in[i]); - u1 = _mm256_unpackhi_epi16(zero, in[i]); - - u0 = _mm256_madd_epi16(u0, sqrt2_epi16); - u1 = _mm256_madd_epi16(u1, sqrt2_epi16); - - u0 = _mm256_add_epi32(u0, dct_const_rounding); - u1 = _mm256_add_epi32(u1, dct_const_rounding); - - u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS); - u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS); - in[i] = _mm256_packs_epi32(u0, u1); - i++; - } -} - -#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H diff --git a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h b/third_party/aom/aom_dsp/x86/txfm_common_intrin.h deleted file mode 100644 index 4e6eecd32..000000000 --- a/third_party/aom/aom_dsp/x86/txfm_common_intrin.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ -#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ - -// Note: -// This header file should be put below any x86 intrinsics head file - -static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { - if (sizeof(tran_low_t) == 4) { - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_storeu_si128((__m128i *)(dst_ptr), out0); - _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); - } else { - _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); - } -} - -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h index 4257d8b9c..58a792424 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -16,17 +16,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" -#define pair_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ - (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) - -#define dual_set_epi16(a, b) \ - _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ - (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) - -#define octa_set_epi16(a, b, c, d, e, f, g, h) \ - _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ - (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) +#define pair_set_epi16(a, b) \ + _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))) // Reverse the 8 16 bit words in __m128i static INLINE __m128i mm_reverse_epi16(const __m128i x) { @@ -35,292 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) { return _mm_shuffle_epi32(b, 0x4e); } -#if CONFIG_EXT_TX -// Identity transform (both forward and inverse). -static INLINE void idtx16_8col(__m128i *in) { - const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0); - const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - __m128i y0, y1, y2, y3, y4, y5, y6, y7; - - in[0] = _mm_slli_epi16(in[0], 1); - in[1] = _mm_slli_epi16(in[1], 1); - in[2] = _mm_slli_epi16(in[2], 1); - in[3] = _mm_slli_epi16(in[3], 1); - in[4] = _mm_slli_epi16(in[4], 1); - in[5] = _mm_slli_epi16(in[5], 1); - in[6] = _mm_slli_epi16(in[6], 1); - in[7] = _mm_slli_epi16(in[7], 1); - in[8] = _mm_slli_epi16(in[8], 1); - in[9] = _mm_slli_epi16(in[9], 1); - in[10] = _mm_slli_epi16(in[10], 1); - in[11] = _mm_slli_epi16(in[11], 1); - in[12] = _mm_slli_epi16(in[12], 1); - in[13] = _mm_slli_epi16(in[13], 1); - in[14] = _mm_slli_epi16(in[14], 1); - in[15] = _mm_slli_epi16(in[15], 1); - - v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16); - v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16); - v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16); - v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16); - v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16); - v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16); - v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16); - v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16); - - u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16); - u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16); - u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16); - u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16); - u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16); - u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16); - u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16); - u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16); - - x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16); - x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16); - x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16); - x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16); - x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16); - x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16); - x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16); - x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16); - - y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16); - y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16); - y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16); - y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16); - y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16); - y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16); - y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16); - y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16); - - v0 = _mm_madd_epi16(v0, k__sqrt2_epi16); - v1 = _mm_madd_epi16(v1, k__sqrt2_epi16); - v2 = _mm_madd_epi16(v2, k__sqrt2_epi16); - v3 = _mm_madd_epi16(v3, k__sqrt2_epi16); - v4 = _mm_madd_epi16(v4, k__sqrt2_epi16); - v5 = _mm_madd_epi16(v5, k__sqrt2_epi16); - v6 = _mm_madd_epi16(v6, k__sqrt2_epi16); - v7 = _mm_madd_epi16(v7, k__sqrt2_epi16); - - x0 = _mm_madd_epi16(x0, k__sqrt2_epi16); - x1 = _mm_madd_epi16(x1, k__sqrt2_epi16); - x2 = _mm_madd_epi16(x2, k__sqrt2_epi16); - x3 = _mm_madd_epi16(x3, k__sqrt2_epi16); - x4 = _mm_madd_epi16(x4, k__sqrt2_epi16); - x5 = _mm_madd_epi16(x5, k__sqrt2_epi16); - x6 = _mm_madd_epi16(x6, k__sqrt2_epi16); - x7 = _mm_madd_epi16(x7, k__sqrt2_epi16); - - u0 = _mm_madd_epi16(u0, k__sqrt2_epi16); - u1 = _mm_madd_epi16(u1, k__sqrt2_epi16); - u2 = _mm_madd_epi16(u2, k__sqrt2_epi16); - u3 = _mm_madd_epi16(u3, k__sqrt2_epi16); - u4 = _mm_madd_epi16(u4, k__sqrt2_epi16); - u5 = _mm_madd_epi16(u5, k__sqrt2_epi16); - u6 = _mm_madd_epi16(u6, k__sqrt2_epi16); - u7 = _mm_madd_epi16(u7, k__sqrt2_epi16); - - y0 = _mm_madd_epi16(y0, k__sqrt2_epi16); - y1 = _mm_madd_epi16(y1, k__sqrt2_epi16); - y2 = _mm_madd_epi16(y2, k__sqrt2_epi16); - y3 = _mm_madd_epi16(y3, k__sqrt2_epi16); - y4 = _mm_madd_epi16(y4, k__sqrt2_epi16); - y5 = _mm_madd_epi16(y5, k__sqrt2_epi16); - y6 = _mm_madd_epi16(y6, k__sqrt2_epi16); - y7 = _mm_madd_epi16(y7, k__sqrt2_epi16); - - v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING); - x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING); - x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING); - x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING); - x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING); - x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING); - x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING); - x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING); - - u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING); - y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING); - y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING); - y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING); - y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING); - y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING); - y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING); - y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - x0 = _mm_srai_epi32(x0, DCT_CONST_BITS); - x1 = _mm_srai_epi32(x1, DCT_CONST_BITS); - x2 = _mm_srai_epi32(x2, DCT_CONST_BITS); - x3 = _mm_srai_epi32(x3, DCT_CONST_BITS); - x4 = _mm_srai_epi32(x4, DCT_CONST_BITS); - x5 = _mm_srai_epi32(x5, DCT_CONST_BITS); - x6 = _mm_srai_epi32(x6, DCT_CONST_BITS); - x7 = _mm_srai_epi32(x7, DCT_CONST_BITS); - - u0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - y0 = _mm_srai_epi32(y0, DCT_CONST_BITS); - y1 = _mm_srai_epi32(y1, DCT_CONST_BITS); - y2 = _mm_srai_epi32(y2, DCT_CONST_BITS); - y3 = _mm_srai_epi32(y3, DCT_CONST_BITS); - y4 = _mm_srai_epi32(y4, DCT_CONST_BITS); - y5 = _mm_srai_epi32(y5, DCT_CONST_BITS); - y6 = _mm_srai_epi32(y6, DCT_CONST_BITS); - y7 = _mm_srai_epi32(y7, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(v0, x0); - in[1] = _mm_packs_epi32(v1, x1); - in[2] = _mm_packs_epi32(v2, x2); - in[3] = _mm_packs_epi32(v3, x3); - in[4] = _mm_packs_epi32(v4, x4); - in[5] = _mm_packs_epi32(v5, x5); - in[6] = _mm_packs_epi32(v6, x6); - in[7] = _mm_packs_epi32(v7, x7); - - in[8] = _mm_packs_epi32(u0, y0); - in[9] = _mm_packs_epi32(u1, y1); - in[10] = _mm_packs_epi32(u2, y2); - in[11] = _mm_packs_epi32(u3, y3); - in[12] = _mm_packs_epi32(u4, y4); - in[13] = _mm_packs_epi32(u5, y5); - in[14] = _mm_packs_epi32(u6, y6); - in[15] = _mm_packs_epi32(u7, y7); -} -#endif // CONFIG_EXT_TX - -static INLINE void scale_sqrt2_8x4(__m128i *in) { - // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32 - // consecutive elements. - const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); - - const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); - const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); - const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); - const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); - const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); - const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); - const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); - const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); - - const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); - const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); - - in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); - in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); - in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); - in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); -} - -static INLINE void scale_sqrt2_8x8(__m128i *in) { - // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)' - // for each element. - const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2); - - const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w); - const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w); - const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w); - const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w); - const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w); - const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w); - const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w); - const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w); - const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w); - const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w); - const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w); - const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w); - const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w); - const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w); - const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w); - const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w); - - const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w); - const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w); - const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w); - const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w); - const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w); - const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w); - const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w); - const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w); - const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w); - const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w); - const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w); - const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w); - const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w); - - in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS)); - in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS)); - in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS)); - in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS)); - in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS)); - in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS)); - in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS)); - in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS), - xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS)); -} - -static INLINE void scale_sqrt2_8x16(__m128i *in) { - scale_sqrt2_8x8(in); - scale_sqrt2_8x8(in + 8); -} - #endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index 18a70dffe..7d6b7d287 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -10,109 +10,224 @@ */ #include -#include "./aom_dsp_rtcd.h" - -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum); - -void aom_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum); - -static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int w, int h, - unsigned int *sse, int *sum, get_var_avx2 var_fn, - int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += 16) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } + +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/masked_variance_intrin_ssse3.h" + +static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) { + return _mm_add_epi16(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); } -unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, - aom_get16x16var_avx2, 16); +static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { + return _mm_add_epi32(_mm256_castsi256_si128(val), + _mm256_extractf128_si256(val, 1)); +} + +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1) + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); - variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); - _mm256_zeroupper(); - return variance; + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); } -unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); - _mm256_zeroupper(); - return *sse; +static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + return _mm_extract_epi32(res, 1); +} + +// handle pixels (<= 512) +static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64); + return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse); +} + +// handle 1024 pixels (32x32, 16x64, 64x16) +static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + // extract the low lane and add it to the high lane + const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum); + const __m128i vsum_64 = + _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +// handle 2048 pixels (32x64, 64x32) +static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum, + unsigned int *const sse) { + vsum = sum_to_32bit_avx2(vsum); + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); + return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); + + for (int i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } } -unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - aom_get32x32var_avx2, 32); +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9); - _mm256_zeroupper(); - return variance; + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - aom_get32x32var_avx2, 32); +static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10); - _mm256_zeroupper(); - return variance; + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - aom_get32x32var_avx2, 32); +static INLINE void variance128_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + *vsum = _mm256_setzero_si256(); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12); - _mm256_zeroupper(); - return variance; + for (int i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum); + variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - unsigned int variance; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - aom_get32x32var_avx2, 32); +#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512); +AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512); +AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512); +AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512); +AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048); - variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11); - _mm256_zeroupper(); - return variance; +AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024); +AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048); + +#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m256i vsse = _mm256_setzero_si256(); \ + __m256i vsum = _mm256_setzero_si256(); \ + for (int i = 0; i < (bh / uh); i++) { \ + __m256i vsum16; \ + variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \ + src += uh * src_stride; \ + ref += uh * ref_stride; \ + } \ + const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \ + const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \ + return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \ + } + +AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32) +AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32) +AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16) +AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16) + +unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + unsigned int *sse) { + aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse); + return *sse; } unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, @@ -125,68 +240,164 @@ unsigned int aom_sub_pixel_avg_variance32xh_avx2( const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, int height, unsigned int *sseptr); -unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - unsigned int sse1; - const int se1 = aom_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); - unsigned int sse2; - const int se2 = - aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); - const int se = se1 + se2; - unsigned int variance; - *sse = sse1 + sse2; - - variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); - _mm256_zeroupper(); - return variance; -} - -unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { - const int se = aom_sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); - - const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); - _mm256_zeroupper(); - return variance; -} - -unsigned int aom_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { - unsigned int sse1; - const int se1 = aom_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); - unsigned int sse2; - const int se2 = aom_sub_pixel_avg_variance32xh_avx2( - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, - 64, 64, &sse2); - const int se = se1 + se2; - unsigned int variance; +#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } + +AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); + +#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \ + } - *sse = sse1 + sse2; +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5); +AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4); - variance = *sse - (uint32_t)(((int64_t)se * se) >> 12); - _mm256_zeroupper(); - return variance; +static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } -unsigned int aom_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { - // Process 32 elements in parallel. - const int se = aom_sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); - - const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10); - _mm256_zeroupper(); - return variance; +static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, + const __m256i a, + uint8_t *comp_pred) { + const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS; + const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits)); + + const __m256i ma = _mm256_sub_epi8(alpha_max, a); + + const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1); + const __m256i aaAL = _mm256_unpacklo_epi8(a, ma); + const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1); + const __m256i aaAH = _mm256_unpackhi_epi8(a, ma); + + const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL); + const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH); + const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset); + const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset); + + const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH); + _mm256_storeu_si256((__m256i *)(comp_pred), roundA); +} + +void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride, + const uint8_t *mask, int mask_stride, + int invert_mask) { + int i = 0; + const uint8_t *src0 = invert_mask ? pred : ref; + const uint8_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + if (width == 8) { + comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1, + mask, mask_stride); + } else if (width == 16) { + do { + const __m256i sA0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sA1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aA = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + const __m256i sB0 = mm256_loadu2(src0 + stride0, src0); + const __m256i sB1 = mm256_loadu2(src1 + stride1, src1); + const __m256i aB = mm256_loadu2(mask + mask_stride, mask); + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + // comp_pred's stride == width == 16 + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (16 << 2); + i += 4; + } while (i < height); + } else { // for width == 32 + do { + const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0)); + const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1)); + const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask)); + + const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0)); + const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1)); + const __m256i aB = + _mm256_lddqu_si256((const __m256i *)(mask + mask_stride)); + + comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred); + comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32); + comp_pred += (32 << 1); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + i += 2; + } while (i < height); + } } diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c index 999b541e3..88e27aef3 100644 --- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c @@ -11,7 +11,8 @@ #include // AVX2 -#include "./aom_dsp_rtcd.h" +#include "config/aom_dsp_rtcd.h" + #include "aom_ports/mem.h" /* clang-format off */ @@ -35,203 +36,6 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { }; /* clang-format on */ -void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); - src = _mm256_inserti128_si256( - src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); - - ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); - ref = _mm256_inserti128_si256( - ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); - - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = - _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - expand_sum_high = - _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = - _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd_high = - _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = - _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = - _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); - - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - - *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - - *((int *)Sum) = _mm_cvtsi128_si32(sum_res); - } - _mm256_zeroupper(); -} - -void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const *)(src_ptr)); - - ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - - expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - - // extract the low lane and the high lane and add the results - *((int *)SSE) = - _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - - *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } - _mm256_zeroupper(); -} - #define FILTER_SRC(filter) \ /* filter the source */ \ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 211fad3f8..c8c90a7dc 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -12,24 +12,24 @@ #include #include // SSE2 -#include "./aom_config.h" -#include "./aom_dsp_rtcd.h" +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" -#include "./av1_rtcd.h" #include "av1/common/filter.h" - -typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +#include "av1/common/onyxc_int.h" +#include "av1/common/reconinter.h" unsigned int aom_get_mb_ss_sse2(const int16_t *src) { __m128i vsum = _mm_setzero_si128(); int i; for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); + const __m128i v = xx_loadu_128(src); vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); src += 8; } @@ -39,276 +39,265 @@ unsigned int aom_get_mb_ss_sse2(const int16_t *src) { return _mm_cvtsi128_si32(vsum); } -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8( \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride)); + return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128()); +} -static void get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); +static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) { + const __m128i p0 = _mm_loadl_epi64((const __m128i *)p); + return _mm_unpacklo_epi8(p0, _mm_setzero_si128()); +} - // sse - vsum = - _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); +// Accumulate 4 32bit numbers in val to 1 32bit number +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return _mm_cvtsi128_si32(val); } -void aom_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, - int ref_stride, unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; +// Accumulate 8 16bit in sum to 4 32bit number +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); - } +static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i diff = _mm_sub_epi16(src, ref); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +// diff sum of 128 pixels can still fit in 16bit integer +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - // sum vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); } -void aom_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); - int i; +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum = (int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); +} - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = add32x4_sse2(vsum); +} - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - src += src_stride; - ref += ref_stride; - } + vsum = sum_to_32bit_sse2(vsum); + *sum = add32x4_sse2(vsum); +} - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = - (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); +static INLINE void variance4_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 256); // May overflow for larger height. + *sum = _mm_setzero_si128(); - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} + for (int i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src, src_stride); + const __m128i r = load4x2_sse2(ref, ref_stride); -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, int w, - int h, unsigned int *sse, int *sum, - getNxMvar_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } + variance_kernel_sse2(s, r, sse, sum); + src += 2 * src_stride; + ref += 2 * ref_stride; } } -unsigned int aom_variance4x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - assert(sum <= 255 * 4 * 4); - assert(sum >= -255 * 4 * 4); - return *sse - ((sum * sum) >> 4); -} - -unsigned int aom_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 8 * 4); - assert(sum >= -255 * 8 * 4); - return *sse - ((sum * sum) >> 5); +static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 128); // May overflow for larger height. + *sum = _mm_setzero_si128(); + for (int i = 0; i < h; i++) { + const __m128i s = load8_8to16_sse2(src); + const __m128i r = load8_8to16_sse2(ref); + + variance_kernel_sse2(s, r, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance4x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 8 * 4); - assert(sum >= -255 * 8 * 4); - return *sse - ((sum * sum) >> 5); +static INLINE void variance16_kernel_sse2(const uint8_t *const src, + const uint8_t *const ref, + __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src); + const __m128i r = _mm_loadu_si128((const __m128i *)ref); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); } -unsigned int aom_variance8x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - aom_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - assert(sum <= 255 * 8 * 8); - assert(sum >= -255 * 8 * 8); - return *sse - ((sum * sum) >> 6); -} +static INLINE void variance16_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 64); // May overflow for larger height. + *sum = _mm_setzero_si128(); -unsigned int aom_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 16 * 8); - assert(sum >= -255 * 16 * 8); - return *sse - ((sum * sum) >> 7); + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src, ref, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 16 * 8); - assert(sum >= -255 * 16 * 8); - return *sse - ((sum * sum) >> 7); +static INLINE void variance32_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse) { - int sum; - aom_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - assert(sum <= 255 * 16 * 16); - assert(sum >= -255 * 16 * 16); - return *sse - ((uint32_t)((int64_t)sum * sum) >> 8); +static INLINE void variance64_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 16); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + variance16_kernel_sse2(src + 0, ref + 0, sse, sum); + variance16_kernel_sse2(src + 16, ref + 16, sse, sum); + variance16_kernel_sse2(src + 32, ref + 32, sse, sum); + variance16_kernel_sse2(src + 48, ref + 48, sse, sum); + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 32 * 32); - assert(sum >= -255 * 32 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); +static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + assert(h <= 8); // May overflow for larger height. + *sum = _mm_setzero_si128(); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < 4; ++j) { + const int offset0 = j << 5; + const int offset1 = offset0 + 16; + variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum); + variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum); + } + src += src_stride; + ref += ref_stride; + } } -unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 32 * 16); - assert(sum >= -255 * 32 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); -} +#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum; \ + int sum = 0; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \ + variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } -unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 32 * 16); - assert(sum >= -255 * 32 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); -} +AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128); +AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128); +AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128); + +AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128); +AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128); +AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128); +AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256); + +AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128); +AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128); +AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256); +AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512); +AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024); + +AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256); +AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512); +AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024); + +#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \ + unsigned int aom_variance##bw##x##bh##_sse2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + __m128i vsse = _mm_setzero_si128(); \ + __m128i vsum = _mm_setzero_si128(); \ + for (int i = 0; i < (bh / uh); ++i) { \ + __m128i vsum16; \ + variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \ + &vsum16); \ + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \ + src += (src_stride * uh); \ + ref += (ref_stride * uh); \ + } \ + *sse = add32x4_sse2(vsse); \ + int sum = add32x4_sse2(vsum); \ + assert(sum <= 255 * bw * bh); \ + assert(sum >= -255 * bw * bh); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \ + } -unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 64); - assert(sum >= -255 * 64 * 64); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); -} +AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 ) -unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 32); - assert(sum >= -255 * 64 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} +AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024); +AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 ) +AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 ) +AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 ) -unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 32); - assert(sum >= -255 * 64 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); -} +AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 ) +AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 ) unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -338,74 +327,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, return *sse; } -#if CONFIG_EXT_PARTITION_TYPES -unsigned int aom_variance4x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 16, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 4 * 16); - assert(sum >= -255 * 4 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); -} - -unsigned int aom_variance16x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 4, sse, &sum, - get4x4var_sse2, 4); - assert(sum <= 255 * 16 * 4); - assert(sum >= -255 * 16 * 4); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 6); -} - -unsigned int aom_variance8x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 32, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 8 * 32); - assert(sum >= -255 * 8 * 32); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); -} - -unsigned int aom_variance32x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 8, sse, &sum, - aom_get8x8var_sse2, 8); - assert(sum <= 255 * 32 * 8); - assert(sum >= -255 * 32 * 8); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 8); -} - -unsigned int aom_variance16x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 64, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 16 * 64); - assert(sum >= -255 * 16 * 64); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} - -unsigned int aom_variance64x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { - int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 16, sse, &sum, - aom_get16x16var_sse2, 16); - assert(sum <= 255 * 64 * 16); - assert(sum >= -255 * 64 * 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); -} -#endif - // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm #define DECL(w, opt) \ @@ -423,75 +344,57 @@ DECLS(ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = aom_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \ + &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#if CONFIG_EXT_PARTITION_TYPES -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ - FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ - FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \ - FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \ - FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t)) -#else -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) -#endif +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) FNS(sse2); FNS(ssse3); @@ -516,76 +419,61 @@ DECLS(ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src, int src_stride, int x_offset, int y_offset, \ + const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \ + const uint8_t *sec) { \ + /*Avoid overflow in helper by capping height.*/ \ + const int hf = AOMMIN(h, 64); \ + unsigned int sse = 0; \ + int se = 0; \ + for (int i = 0; i < (w / wf); ++i) { \ + const uint8_t *src_ptr = src; \ + const uint8_t *dst_ptr = dst; \ + const uint8_t *sec_ptr = sec; \ + for (int j = 0; j < (h / hf); ++j) { \ + unsigned int sse2; \ + const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ + sec_ptr, w, hf, &sse2, NULL, NULL); \ + dst_ptr += hf * dst_stride; \ + src_ptr += hf * src_stride; \ + sec_ptr += hf * w; \ + se += se2; \ + sse += sse2; \ + } \ + src += wf; \ + dst += wf; \ + sec += wf; \ + } \ + *sse_ptr = sse; \ + return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } -#if CONFIG_EXT_PARTITION_TYPES -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ - FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ - FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ - FN(8, 32, 8, 3, 5, opt, (int32_t), (int32_t)); \ - FN(32, 8, 16, 5, 3, opt, (int32_t), (int32_t)); \ - FN(16, 64, 16, 4, 6, opt, (int32_t), (int32_t)); \ - FN(64, 16, 16, 6, 4, opt, (int32_t), (int32_t)) -#else -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ - FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ - FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ - FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ - FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) -#endif +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \ + FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \ + FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \ + FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \ + FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \ + FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \ + FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \ + FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t)) FNS(sse2); FNS(ssse3); @@ -593,9 +481,97 @@ FNS(ssse3); #undef FNS #undef FN -void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, +void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride) { + // expect xd == NULL only in tests + if (xd != NULL) { + const MB_MODE_INFO *mi = xd->mi[0]; + const int ref_num = 0; + const int is_intrabc = is_intrabc_block(mi); + const struct scale_factors *const sf = + is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + const int is_scaled = av1_is_scaled(sf); + + if (is_scaled) { + // Note: This is mostly a copy from the >=8X8 case in + // build_inter_predictors() function, with some small tweaks. + + // Some assumptions. + const int plane = 0; + + // Get pre-requisites. + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int ssx = pd->subsampling_x; + const int ssy = pd->subsampling_y; + assert(ssx == 0 && ssy == 0); + const struct buf_2d *const dst_buf = &pd->dst; + const struct buf_2d *const pre_buf = + is_intrabc ? dst_buf : &pd->pre[ref_num]; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; + + // Calculate subpel_x/y and x/y_step. + const int row_start = 0; // Because ss_y is 0. + const int col_start = 0; // Because ss_x is 0. + const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; + const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; + int orig_pos_y = pre_y << SUBPEL_BITS; + orig_pos_y += mv->row * (1 << (1 - ssy)); + int orig_pos_x = pre_x << SUBPEL_BITS; + orig_pos_x += mv->col * (1 << (1 - ssx)); + int pos_y = sf->scale_value_y(orig_pos_y, sf); + int pos_x = sf->scale_value_x(orig_pos_x, sf); + pos_x += SCALE_EXTRA_OFF; + pos_y += SCALE_EXTRA_OFF; + + const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); + const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); + const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + const int right = (pre_buf->width + AOM_INTERP_EXTEND) + << SCALE_SUBPEL_BITS; + pos_y = clamp(pos_y, top, bottom); + pos_x = clamp(pos_x, left, right); + + const uint8_t *const pre = + pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + + (pos_x >> SCALE_SUBPEL_BITS); + + const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, + pos_x & SCALE_SUBPEL_MASK, + pos_y & SCALE_SUBPEL_MASK }; + + // Get warp types. + const WarpedMotionParams *const wm = + &xd->global_motion[mi->ref_frame[ref_num]]; + const int is_global = is_global_mv_block(mi, wm->wmtype); + WarpTypesAllowed warp_types; + warp_types.global_warp_allowed = is_global; + warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; + + // Get convolve parameters. + ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + const InterpFilters filters = + av1_broadcast_interp_filter(EIGHTTAP_REGULAR); + + // Get the inter predictor. + const int build_for_obmc = 0; + av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width, + &subpel_params, sf, width, height, &conv_params, + filters, &warp_types, mi_x >> pd->subsampling_x, + mi_y >> pd->subsampling_y, plane, ref_num, mi, + build_for_obmc, xd, cm->allow_warped_motion); + + return; + } + } + + const InterpFilterParams filter = + av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + if (!subpel_x_q3 && !subpel_y_q3) { if (width >= 16) { int i; @@ -604,8 +580,7 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, for (i = 0; i < height; i++) { int j; for (j = 0; j < width; j += 16) { - __m128i s0 = _mm_loadu_si128((const __m128i *)ref); - _mm_storeu_si128((__m128i *)comp_pred, s0); + xx_storeu_128(comp_pred, xx_loadu_128(ref)); comp_pred += 16; ref += 16; } @@ -617,10 +592,9 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, assert(!(height & 1)); /*Read 8 pixels two rows at a time.*/ for (i = 0; i < height; i += 2) { - __m128i s0 = _mm_loadl_epi64((const __m128i *)ref); - __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride)); - __m128i t0 = _mm_unpacklo_epi64(s0, s1); - _mm_storeu_si128((__m128i *)comp_pred, t0); + __m128i s0 = xx_loadl_64(ref + 0 * ref_stride); + __m128i s1 = xx_loadl_64(ref + 1 * ref_stride); + xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1)); comp_pred += 16; ref += 2 * ref_stride; } @@ -630,69 +604,62 @@ void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height, assert(!(height & 3)); /*Read 4 pixels four rows at a time.*/ for (i = 0; i < height; i++) { - __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref); - __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + ref_stride)); - __m128i s2 = - _mm_cvtsi32_si128(*(const uint32_t *)(ref + 2 * ref_stride)); - __m128i s3 = - _mm_cvtsi32_si128(*(const uint32_t *)(ref + 3 * ref_stride)); - __m128i t0 = _mm_unpacklo_epi32(s0, s1); - __m128i t1 = _mm_unpacklo_epi32(s2, s3); - __m128i u0 = _mm_unpacklo_epi64(t0, t1); - _mm_storeu_si128((__m128i *)comp_pred, u0); + const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride); + const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride); + const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride); + const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride); + const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1), + _mm_unpacklo_epi32(row2, row3)); + xx_storeu_128(comp_pred, reg); comp_pred += 16; ref += 4 * ref_stride; } } + } else if (!subpel_y_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, + width, height); + } else if (!subpel_x_q3) { + const int16_t *const kernel = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, + width, height); } else { - InterpFilterParams filter; - filter = av1_get_interp_filter_params(EIGHTTAP_REGULAR); - if (!subpel_y_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, - -1, width, height); - } else if (!subpel_x_q3) { - const int16_t *kernel; - kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, - 16, width, height); - } else { - DECLARE_ALIGNED(16, uint8_t, - temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); - const int16_t *kernel_x; - const int16_t *kernel_y; - int intermediate_height; - kernel_x = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; - assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, - width, intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), - MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, - width, height); - } + DECLARE_ALIGNED(16, uint8_t, + temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); + const int16_t *const kernel_x = + av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); + const int16_t *const kernel_y = + av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); + aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride, + temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, + intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); } } -void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, - int ref_stride) { +void aom_comp_avg_upsampled_pred_sse2( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride) { int n; int i; - aom_upsampled_pred(comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref, - ref_stride); + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; for (i = 0; i < n; i++) { - __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred); - __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu8(s0, p0)); + __m128i s0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(pred); + xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0)); comp_pred += 16; pred += 16; } -- cgit v1.2.3 From b8df135c97a854c2ff9b4394b016649c601177fa Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 23:00:02 -0500 Subject: Update libaom to rev b25610052a1398032320008d69b51d2da94f5928 --- third_party/aom/aom_dsp/aom_dsp.cmake | 8 +- third_party/aom/aom_dsp/aom_dsp_rtcd.c | 2 +- third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 23 +- third_party/aom/aom_dsp/arm/intrapred_neon.c | 60 ++ third_party/aom/aom_dsp/arm/loopfilter_neon.c | 228 ++++++++ third_party/aom/aom_dsp/bitreader_buffer.c | 7 +- third_party/aom/aom_dsp/bitreader_buffer.h | 2 +- third_party/aom/aom_dsp/bitwriter_buffer.c | 3 + third_party/aom/aom_dsp/grain_synthesis.c | 40 +- third_party/aom/aom_dsp/grain_synthesis.h | 10 +- third_party/aom/aom_dsp/noise_model.c | 186 +++++++ third_party/aom/aom_dsp/noise_model.h | 37 ++ third_party/aom/aom_dsp/simd/v256_intrinsics.h | 9 + third_party/aom/aom_dsp/variance.c | 39 +- third_party/aom/aom_dsp/variance.h | 7 + .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 619 ++++++++++++++++----- third_party/aom/aom_dsp/x86/convolve_avx2.h | 32 +- third_party/aom/aom_dsp/x86/convolve_sse2.h | 2 +- third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c | 8 +- .../aom/aom_dsp/x86/highbd_convolve_ssse3.c | 8 +- third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 8 +- third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c | 110 +--- .../aom/aom_dsp/x86/masked_sad_intrin_avx2.c | 390 +++++++++++++ .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 70 ++- .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.h | 33 ++ third_party/aom/aom_dsp/x86/obmc_sad_avx2.c | 270 +++++++++ third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 50 ++ third_party/aom/aom_dsp/x86/subtract_avx2.c | 108 ++++ third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 199 +++++++ third_party/aom/aom_dsp/x86/variance_avx2.c | 113 ++++ third_party/aom/aom_dsp/x86/variance_impl_ssse3.c | 129 +++++ third_party/aom/aom_dsp/x86/variance_sse2.c | 12 +- 32 files changed, 2435 insertions(+), 387 deletions(-) create mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h create mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/subtract_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/txfm_common_avx2.h create mode 100644 third_party/aom/aom_dsp/x86/variance_impl_ssse3.c (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 768875f7d..7c0111a69 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -83,6 +83,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/common_avx2.h" + "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h" "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h" "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" @@ -190,13 +191,16 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") @@ -205,9 +209,11 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm") list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c") diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c index 5d7d4515b..1514bd64e 100644 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c @@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void aom_dsp_rtcd() { once(setup_rtcd_internal); } +void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); } diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index a8ac5eb5c..1a9ac3184 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -377,7 +377,7 @@ add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8 specialize qw/aom_lpf_vertical_14_dual sse2/; add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_6 sse2/; +specialize qw/aom_lpf_vertical_6 sse2 neon/; add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; specialize qw/aom_lpf_vertical_8 sse2 neon/; @@ -386,13 +386,13 @@ add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_ specialize qw/aom_lpf_vertical_8_dual sse2/; add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_vertical_4 sse2/; +specialize qw/aom_lpf_vertical_4 sse2 neon/; add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_vertical_4_dual sse2/; add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_14 sse2/; +specialize qw/aom_lpf_horizontal_14 sse2 neon/; add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_14_dual sse2/; @@ -410,7 +410,7 @@ add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint specialize qw/aom_lpf_horizontal_8_dual sse2/; add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"; -specialize qw/aom_lpf_horizontal_4 sse2/; +specialize qw/aom_lpf_horizontal_4 sse2 neon/; add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"; specialize qw/aom_lpf_horizontal_4_dual sse2/; @@ -564,7 +564,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # Block subtraction # add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; - specialize qw/aom_subtract_block neon msa sse2/; + specialize qw/aom_subtract_block neon msa sse2 avx2/; add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/aom_highbd_subtract_block sse2/; @@ -732,14 +732,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask"; - specialize "aom_masked_sad${w}x${h}", qw/ssse3/; + specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/; } foreach (@block_sizes) { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask"; - specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/; + specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/; } @@ -750,7 +750,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_obmc_sad${w}x${h}", qw/sse4_1/; + specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/; } } @@ -759,7 +759,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask"; if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) { - specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1/; + specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/; } } @@ -1102,6 +1102,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; + specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/; } @@ -1539,9 +1540,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_comp_mask_pred ssse3 avx2/; add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - add_proto qw/void aom_highbd_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd"; - + specialize qw/aom_highbd_comp_mask_pred avx2/; } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c index 69470eeb0..c85b1e910 100644 --- a/third_party/aom/aom_dsp/arm/intrapred_neon.c +++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c @@ -528,3 +528,63 @@ void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, } } } + +static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw, + const uint16_t *above, + const uint16_t *left) { + assert(bw >= 4); + assert(IS_POWER_OF_TWO(bw)); + int expected_dc, sum = 0; + const int count = bw * 2; + uint32x4_t sum_q = vdupq_n_u32(0); + uint32x2_t sum_d; + uint16_t *dst_1; + if (bw >= 8) { + for (int i = 0; i < bw; i += 8) { + sum_q = vpadalq_u16(sum_q, vld1q_u16(above)); + sum_q = vpadalq_u16(sum_q, vld1q_u16(left)); + above += 8; + left += 8; + } + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + dst_1 = dst; + for (int i = 0; i < bw; i += 8) { + vst1q_u16(dst_1, dc); + dst_1 += 8; + } + dst += stride; + } + } else { // 4x4 + sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left)); + sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q)); + sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0); + expected_dc = (sum + (count >> 1)) / count; + const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc); + for (int r = 0; r < bw; r++) { + vst1_u16(dst, dc); + dst += stride; + } + } +} + +#define intra_pred_highbd_sized_neon(type, width) \ + void aom_highbd_##type##_predictor_##width##x##width##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + (void)bd; \ + highbd_##type##_predictor(dst, stride, width, above, left); \ + } + +#define intra_pred_square(type) \ + intra_pred_highbd_sized_neon(type, 4); \ + intra_pred_highbd_sized_neon(type, 8); \ + intra_pred_highbd_sized_neon(type, 16); \ + intra_pred_highbd_sized_neon(type, 32); \ + intra_pred_highbd_sized_neon(type, 64); + +intra_pred_square(dc); +#undef intra_pred_square diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c index ee1a3c78f..bdc67626d 100644 --- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c +++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c @@ -52,6 +52,36 @@ static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, return mask_8x8; } +static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0, + const uint8_t blimit, const uint8_t limit) { + uint32x2x2_t p0q0_p1q1; + uint16x8_t temp_16x8; + uint16x4_t temp0_16x4, temp1_16x4; + const uint16x4_t blimit_16x4 = vdup_n_u16(blimit); + const uint8x8_t limit_8x8 = vdup_n_u8(limit); + uint8x8_t mask_8x8, temp_8x8; + + mask_8x8 = vabd_u8(p1q1, p0q0); + mask_8x8 = vcle_u8(mask_8x8, limit_8x8); + + temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8))); + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1)); + temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]), + vreinterpret_u8_u32(p0q0_p1q1.val[1])); + temp_16x8 = vmovl_u8(temp_8x8); + temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1); + temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1); + temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4); + temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4); + temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4)); + + mask_8x8 = vand_u8(mask_8x8, temp_8x8); + + return mask_8x8; +} + static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1, uint8x8_t p0q0) { const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1 @@ -523,6 +553,68 @@ static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0, } } +static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit, + const uint8_t limit, const uint8_t thresh) { + int32x2x2_t ps0_qs0, ps1_qs1; + int16x8_t filter_s16; + const uint8x8_t thresh_f4 = vdup_n_u8(thresh); + uint8x8_t mask_8x8, temp0_8x8, temp1_8x8; + int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8; + int8x8_t op0, oq0, op1, oq1; + int8x8_t pq_s0, pq_s1; + int8x8_t filter_s8, filter1_s8, filter2_s8; + int8x8_t hev_8x8; + const int8x8_t sign_mask = vdup_n_s8(0x80); + const int8x8_t val_4 = vdup_n_s8(4); + const int8x8_t val_3 = vdup_n_s8(3); + + // Calculate filter mask + mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit); + + pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask); + pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask); + + ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0)); + ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1)); + ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]); + qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]); + ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]); + qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]); + + // hev_mask + temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4); + temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8))); + hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8)); + + // add outer taps if we have high edge variance + filter_s8 = vqsub_s8(ps1_s8, qs1_s8); + filter_s8 = vand_s8(filter_s8, hev_8x8); + + // inner taps + temp_s8 = vqsub_s8(qs0_s8, ps0_s8); + filter_s16 = vmovl_s8(filter_s8); + filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3); + filter_s8 = vqmovn_s16(filter_s16); + filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8)); + + filter1_s8 = vqadd_s8(filter_s8, val_4); + filter2_s8 = vqadd_s8(filter_s8, val_3); + filter1_s8 = vshr_n_s8(filter1_s8, 3); + filter2_s8 = vshr_n_s8(filter2_s8, 3); + + oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask); + op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask); + + filter_s8 = vrshr_n_s8(filter1_s8, 1); + filter_s8 = vbic_s8(filter_s8, hev_8x8); + + oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask); + op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask); + + *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4)); + *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4)); +} + void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x16_t row0, row1, row2, row3; @@ -646,6 +738,125 @@ void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit, store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3); } +void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p2q2_p1q1, pxqy_p0q0; + uint32x2_t pq_rev; + uint8x8_t pxq0, p2q1, p1q2, p0qy; + uint8x8_t p0q0, p1q1, p2q2, pxqy; + + // row0: px p2 p1 p0 | q0 q1 q2 qy + // row1: px p2 p1 p0 | q0 q1 q2 qy + // row2: px p2 p1 p0 | q0 q1 q2 qy + // row3: px p2 p1 p0 | q0 q1 q2 qy + load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy); + + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev); + + p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + + lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0)); + pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev); + + pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1)); + p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev); + + p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1])); + p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1])); + p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]); + pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]); + transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy); + + store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy); +} + +void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0; + uint32x2_t pq_rev; + uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1; + + // row0: p1 p0 | q0 q1 + // row1: p1 p0 | q0 q1 + // row2: p1 p0 | q0 q1 + // row3: p1 p0 | q0 q1 + load_u8_4x1(src - 2, &p1p0, 0); + load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1); + load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0); + load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1); + + transpose_u8_4x4(&p1p0, &q0q1); + + p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1)); + + pq_rev = vrev64_u32(p1q0_p0q1.val[1]); + p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev); + + p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]); + p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0)); + + p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]); + q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1])); + + transpose_u8_4x4(&p1p0, &q0q1); + + store_u8_4x1(src - 2, p1p0, 0); + store_u8_4x1((src - 2) + 1 * stride, q0q1, 0); + store_u8_4x1((src - 2) + 2 * stride, p1p0, 1); + store_u8_4x1((src - 2) + 3 * stride, q0q1, 1); +} + +void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6); + + load_u8_4x1(src - 7 * stride, &p6q6, 0); + load_u8_4x1(src - 6 * stride, &p5q5, 0); + load_u8_4x1(src - 5 * stride, &p4q4, 0); + load_u8_4x1(src - 4 * stride, &p3q3, 0); + load_u8_4x1(src - 3 * stride, &p2q2, 0); + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + load_u8_4x1(src + 2 * stride, &p2q2, 1); + load_u8_4x1(src + 3 * stride, &p3q3, 1); + load_u8_4x1(src + 4 * stride, &p4q4, 1); + load_u8_4x1(src + 5 * stride, &p5q5, 1); + load_u8_4x1(src + 6 * stride, &p6q6, 1); + + lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, + *thresh); + + store_u8_4x1(src - 6 * stride, p5q5, 0); + store_u8_4x1(src - 5 * stride, p4q4, 0); + store_u8_4x1(src - 4 * stride, p3q3, 0); + store_u8_4x1(src - 3 * stride, p2q2, 0); + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); + store_u8_4x1(src + 2 * stride, p2q2, 1); + store_u8_4x1(src + 3 * stride, p3q3, 1); + store_u8_4x1(src + 4 * stride, p4q4, 1); + store_u8_4x1(src + 5 * stride, p5q5, 1); +} + void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { uint8x8_t p0q0, p1q1, p2q2, p3q3; @@ -698,3 +909,20 @@ void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit, vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1); vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1); } + +void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { + uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1); + + load_u8_4x1(src - 2 * stride, &p1q1, 0); + load_u8_4x1(src - 1 * stride, &p0q0, 0); + load_u8_4x1(src + 0 * stride, &p0q0, 1); + load_u8_4x1(src + 1 * stride, &p1q1, 1); + + lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); + + store_u8_4x1(src - 2 * stride, p1q1, 0); + store_u8_4x1(src - 1 * stride, p0q0, 0); + store_u8_4x1(src + 0 * stride, p0q0, 1); + store_u8_4x1(src + 1 * stride, p1q1, 1); +} diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c index 68fc381f2..02b5ef924 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.c +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -8,11 +8,14 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ + +#include + #include "config/aom_config.h" #include "aom_dsp/bitreader_buffer.h" -size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb) { +size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) { return (rb->bit_offset + 7) >> 3; } @@ -31,6 +34,7 @@ int aom_rb_read_bit(struct aom_read_bit_buffer *rb) { } int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { + assert(bits <= 31); int value = 0, bit; for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit; return value; @@ -38,6 +42,7 @@ int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) { uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits) { + assert(bits <= 32); uint32_t value = 0; int bit; for (bit = bits - 1; bit >= 0; bit--) diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h index 2dafe11ad..5c94ab883 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.h +++ b/third_party/aom/aom_dsp/bitreader_buffer.h @@ -31,7 +31,7 @@ struct aom_read_bit_buffer { aom_rb_error_handler error_handler; }; -size_t aom_rb_bytes_read(struct aom_read_bit_buffer *rb); +size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb); int aom_rb_read_bit(struct aom_read_bit_buffer *rb); diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c index 21314eb2a..a563bf684 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.c +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include #include #include @@ -49,12 +50,14 @@ void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) { } void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) { + assert(bits <= 31); int bit; for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); } void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb, uint32_t data, int bits) { + assert(bits <= 32); int bit; for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1); } diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c index fcb6c290e..ff1ec41a2 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.c +++ b/third_party/aom/aom_dsp/grain_synthesis.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "aom_dsp/grain_synthesis.h" #include "aom_mem/aom_mem.h" @@ -237,7 +238,7 @@ static int grain_max; static uint16_t random_register = 0; // random number generator register -static void init_arrays(aom_film_grain_t *params, int luma_stride, +static void init_arrays(const aom_film_grain_t *params, int luma_stride, int chroma_stride, int ***pred_pos_luma_p, int ***pred_pos_chroma_p, int **luma_grain_block, int **cb_grain_block, int **cr_grain_block, @@ -331,7 +332,7 @@ static void init_arrays(aom_film_grain_t *params, int luma_stride, (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples); } -static void dealloc_arrays(aom_film_grain_t *params, int ***pred_pos_luma, +static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma, int ***pred_pos_chroma, int **luma_grain_block, int **cb_grain_block, int **cr_grain_block, int **y_line_buf, int **cb_line_buf, @@ -396,10 +397,14 @@ static void init_random_generator(int luma_line, uint16_t seed) { } static void generate_luma_grain_block( - aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, + const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, int left_pad, int top_pad, int right_pad, int bottom_pad) { - if (params->num_y_points == 0) return; + if (params->num_y_points == 0) { + memset(luma_grain_block, 0, + sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); + return; + } int bit_depth = params->bit_depth; int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift; @@ -431,7 +436,7 @@ static void generate_luma_grain_block( } static void generate_chroma_grain_blocks( - aom_film_grain_t *params, + const aom_film_grain_t *params, // int** pred_pos_luma, int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y, @@ -443,7 +448,7 @@ static void generate_chroma_grain_blocks( int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1); if (params->num_y_points > 0) ++num_pos_chroma; int rounding_offset = (1 << (params->ar_coeff_shift - 1)); - int chroma_grain_samples = chroma_block_size_y * chroma_block_size_x; + int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride; if (params->num_cb_points || params->chroma_scaling_from_luma) { init_random_generator(7 << 5, params->random_seed); @@ -455,7 +460,8 @@ static void generate_chroma_grain_blocks( ((1 << gauss_sec_shift) >> 1)) >> gauss_sec_shift; } else { - memset(cr_grain_block, 0, sizeof(*cr_grain_block) * chroma_grain_samples); + memset(cb_grain_block, 0, + sizeof(*cb_grain_block) * chroma_grain_block_size); } if (params->num_cr_points || params->chroma_scaling_from_luma) { @@ -468,7 +474,8 @@ static void generate_chroma_grain_blocks( ((1 << gauss_sec_shift) >> 1)) >> gauss_sec_shift; } else { - memset(cb_grain_block, 0, sizeof(*cb_grain_block) * chroma_grain_samples); + memset(cr_grain_block, 0, + sizeof(*cr_grain_block) * chroma_grain_block_size); } for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++) @@ -522,7 +529,7 @@ static void generate_chroma_grain_blocks( } } -static void init_scaling_function(int scaling_points[][2], int num_points, +static void init_scaling_function(const int scaling_points[][2], int num_points, int scaling_lut[]) { if (num_points == 0) return; @@ -559,7 +566,7 @@ static int scale_LUT(int *scaling_lut, int index, int bit_depth) { (bit_depth - 8)); } -static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma, +static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, int *cr_grain, @@ -675,7 +682,7 @@ static void add_noise_to_block(aom_film_grain_t *params, uint8_t *luma, } static void add_noise_to_block_hbd( - aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, + const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr, int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain, int *cr_grain, int luma_grain_stride, int chroma_grain_stride, int half_luma_height, int half_luma_width, int bit_depth, @@ -903,7 +910,7 @@ static void hor_boundary_overlap(int *top_block, int top_stride, } } -void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src, +void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, aom_image_t *dst) { uint8_t *luma, *cb, *cr; int height, width, luma_stride, chroma_stride; @@ -950,6 +957,11 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src, exit(1); } + assert(params->bit_depth == src->bit_depth); + + dst->fmt = src->fmt; + dst->bit_depth = src->bit_depth; + dst->r_w = src->r_w; dst->r_h = src->r_h; dst->d_w = src->d_w; @@ -999,15 +1011,13 @@ void av1_add_film_grain(aom_film_grain_t *params, aom_image_t *src, luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; - params->bit_depth = dst->bit_depth; - av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride, chroma_stride, use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); return; } -void av1_add_film_grain_run(aom_film_grain_t *params, uint8_t *luma, +void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int height, int width, int luma_stride, int chroma_stride, int use_high_bit_depth, int chroma_subsamp_y, diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h index 016cb12d7..65feb6068 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.h +++ b/third_party/aom/aom_dsp/grain_synthesis.h @@ -72,7 +72,7 @@ typedef struct { int clip_to_restricted_range; - int bit_depth; // video bit depth + unsigned int bit_depth; // video bit depth int chroma_scaling_from_luma; @@ -94,7 +94,7 @@ typedef struct { * \param[in] luma_stride luma plane stride * \param[in] chroma_stride chroma plane stride */ -void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma, +void av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, uint8_t *cb, uint8_t *cr, int height, int width, int luma_stride, int chroma_stride, int use_high_bit_depth, int chroma_subsamp_y, @@ -106,10 +106,10 @@ void av1_add_film_grain_run(aom_film_grain_t *grain_params, uint8_t *luma, * * \param[in] grain_params Grain parameters * \param[in] src Source image - * \param[in] dst Resulting image with grain + * \param[out] dst Resulting image with grain */ -void av1_add_film_grain(aom_film_grain_t *grain_params, aom_image_t *src, - aom_image_t *dst); +void av1_add_film_grain(const aom_film_grain_t *grain_params, + const aom_image_t *src, aom_image_t *dst); #ifdef __cplusplus } // extern "C" diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c index a1287f74f..5975c62e8 100644 --- a/third_party/aom/aom_dsp/noise_model.c +++ b/third_party/aom/aom_dsp/noise_model.c @@ -1458,3 +1458,189 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], } return init_success; } + +struct aom_denoise_and_model_t { + int block_size; + int bit_depth; + float noise_level; + + // Size of current denoised buffer and flat_block buffer + int width; + int height; + int y_stride; + int uv_stride; + int num_blocks_w; + int num_blocks_h; + + // Buffers for image and noise_psd allocated on the fly + float *noise_psd[3]; + uint8_t *denoised[3]; + uint8_t *flat_blocks; + + aom_flat_block_finder_t flat_block_finder; + aom_noise_model_t noise_model; +}; + +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level) { + struct aom_denoise_and_model_t *ctx = + (struct aom_denoise_and_model_t *)aom_malloc( + sizeof(struct aom_denoise_and_model_t)); + if (!ctx) { + fprintf(stderr, "Unable to allocate denoise_and_model struct\n"); + return NULL; + } + memset(ctx, 0, sizeof(*ctx)); + + ctx->block_size = block_size; + ctx->noise_level = noise_level; + ctx->bit_depth = bit_depth; + + ctx->noise_psd[0] = + aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size); + ctx->noise_psd[1] = + aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size); + ctx->noise_psd[2] = + aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size); + if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) { + fprintf(stderr, "Unable to allocate noise PSD buffers\n"); + aom_denoise_and_model_free(ctx); + return NULL; + } + return ctx; +} + +void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) { + aom_free(ctx->flat_blocks); + for (int i = 0; i < 3; ++i) { + aom_free(ctx->denoised[i]); + aom_free(ctx->noise_psd[i]); + } + aom_noise_model_free(&ctx->noise_model); + aom_flat_block_finder_free(&ctx->flat_block_finder); + aom_free(ctx); +} + +static int denoise_and_model_realloc_if_necessary( + struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) { + if (ctx->width == sd->y_width && ctx->height == sd->y_height && + ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) + return 1; + const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + const int block_size = ctx->block_size; + + ctx->width = sd->y_width; + ctx->height = sd->y_height; + ctx->y_stride = sd->y_stride; + ctx->uv_stride = sd->uv_stride; + + for (int i = 0; i < 3; ++i) { + aom_free(ctx->denoised[i]); + ctx->denoised[i] = NULL; + } + aom_free(ctx->flat_blocks); + ctx->flat_blocks = NULL; + + ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd); + ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); + ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd); + if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) { + fprintf(stderr, "Unable to allocate denoise buffers\n"); + return 0; + } + ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size; + ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size; + ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h); + + aom_flat_block_finder_free(&ctx->flat_block_finder); + if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size, + ctx->bit_depth, use_highbd)) { + fprintf(stderr, "Unable to init flat block finder\n"); + return 0; + } + + const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3, + ctx->bit_depth, use_highbd }; + aom_noise_model_free(&ctx->noise_model); + if (!aom_noise_model_init(&ctx->noise_model, params)) { + fprintf(stderr, "Unable to init noise model\n"); + return 0; + } + + // Simply use a flat PSD (although we could use the flat blocks to estimate + // PSD) those to estimate an actual noise PSD) + const float y_noise_level = + aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level); + const float uv_noise_level = aom_noise_psd_get_default_value( + ctx->block_size >> sd->subsampling_x, ctx->noise_level); + for (int i = 0; i < block_size * block_size; ++i) { + ctx->noise_psd[0][i] = y_noise_level; + ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level; + } + return 1; +} + +int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, + YV12_BUFFER_CONFIG *sd, + aom_film_grain_t *film_grain) { + const int block_size = ctx->block_size; + const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; + uint8_t *raw_data[3] = { + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer, + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer, + use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer, + }; + const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] }; + int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride }; + int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y }; + + if (!denoise_and_model_realloc_if_necessary(ctx, sd)) { + fprintf(stderr, "Unable to realloc buffers\n"); + return 0; + } + + aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width, + sd->y_height, strides[0], ctx->flat_blocks); + + if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height, + strides, chroma_sub_log2, ctx->noise_psd, + block_size, ctx->bit_depth, use_highbd)) { + fprintf(stderr, "Unable to denoise image\n"); + return 0; + } + + const aom_noise_status_t status = aom_noise_model_update( + &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised, + sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks, + block_size); + int have_noise_estimate = 0; + if (status == AOM_NOISE_STATUS_OK) { + have_noise_estimate = 1; + } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) { + aom_noise_model_save_latest(&ctx->noise_model); + have_noise_estimate = 1; + } else { + // Unable to update noise model; proceed if we have a previous estimate. + have_noise_estimate = + (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0); + } + + film_grain->apply_grain = 0; + if (have_noise_estimate) { + if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) { + fprintf(stderr, "Unable to get grain parameters.\n"); + return 0; + } + if (!film_grain->random_seed) { + film_grain->random_seed = 1071; + } + memcpy(raw_data[0], ctx->denoised[0], + (strides[0] * sd->y_height) << use_highbd); + memcpy(raw_data[1], ctx->denoised[1], + (strides[1] * sd->uv_height) << use_highbd); + memcpy(raw_data[2], ctx->denoised[2], + (strides[2] * sd->uv_height) << use_highbd); + } + return 1; +} diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h index dabeacc14..b07bf8617 100644 --- a/third_party/aom/aom_dsp/noise_model.h +++ b/third_party/aom/aom_dsp/noise_model.h @@ -18,6 +18,7 @@ extern "C" { #include #include "aom_dsp/grain_synthesis.h" +#include "aom_scale/yv12config.h" /*!\brief Wrapper of data required to represent linear system of eqns and soln. */ @@ -280,6 +281,42 @@ int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3], int w, int h, int stride[3], int chroma_sub_log2[2], float *noise_psd[3], int block_size, int bit_depth, int use_highbd); + +struct aom_denoise_and_model_t; + +/*!\brief Denoise the buffer and model the residual noise. + * + * This is meant to be called sequentially on input frames. The input buffer + * is denoised and the residual noise is modelled. The current noise estimate + * is populated in film_grain. Returns true on success. The grain.apply_grain + * parameter will be true when the input buffer was successfully denoised and + * grain was modelled. Returns false on error. + * + * \param[in] ctx Struct allocated with aom_denoise_and_model_alloc + * that holds some buffers for denoising and the current + * noise estimate. + * \param[in/out] buf The raw input buffer to be denoised. + * \param[out] grain Output film grain parameters + */ +int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, + YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain); + +/*!\brief Allocates a context that can be used for denoising and noise modeling. + * + * \param[in] bit_depth Bit depth of buffers this will be run on. + * \param[in] block_size Block size for noise modeling and flat block + * estimation + * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for + * higher levels of noise) + */ +struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth, + int block_size, + float noise_level); + +/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc + */ +void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model); + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h index 0e5ae5b68..4b70cc57b 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -289,6 +289,15 @@ SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) { SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) { return c_v256_shr_s32(a, c); } +SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) { + return c_v256_shl_64(a, c); +} +SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) { + return c_v256_shr_u64(a, c); +} +SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) { + return c_v256_shr_s64(a, c); +} SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) { return c_v256_shr_n_byte(a, n); diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index d367905bc..817ebe15d 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -386,7 +386,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -413,12 +413,12 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride, - temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, width, height); } @@ -974,7 +974,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -1004,14 +1004,14 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, 16, width, height, bd); } @@ -1185,29 +1185,18 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, } } -void aom_highbd_comp_mask_upsampled_pred_c( +void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, int bd) { - int i, j; - - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, bd); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; ++j) { - if (!invert_mask) - comp_pred[j] = AOM_BLEND_A64(mask[j], comp_pred[j], pred[j]); - else - comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], comp_pred[j]); - } - comp_pred += width; - pred += width; - mask += mask_stride; - } + aom_highbd_comp_mask_pred(comp_pred, pred8, width, height, + CONVERT_TO_BYTEPTR(comp_pred), width, mask, + mask_stride, invert_mask); } #define HIGHBD_MASK_SUBPIX_VAR(W, H) \ diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index 544dda944..b954470de 100644 --- a/third_party/aom/aom_dsp/variance.h +++ b/third_party/aom/aom_dsp/variance.h @@ -76,6 +76,13 @@ void aom_comp_mask_upsampled_pred( int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); +void aom_highbd_comp_mask_upsampled_pred( + MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int bd); + typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, const int32_t *msk); diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index af45a03ac..f3fe50372 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -41,20 +41,290 @@ #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) #endif // __clang__ +static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a)); + *((uint32_t *)(output_ptr + stride)) = + _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_storel_epi64((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { + __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); + a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); + return a; +} + +static INLINE void xx_store2_mi128(const uint8_t *output_ptr, + const ptrdiff_t stride, const __m256i *a) { + _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a)); + _mm_store_si128((__m128i *)(output_ptr + stride), + _mm256_extractf128_si256(*a, 1)); +} + +static void aom_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg; + __m256i firstFilters, secondFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 32 bits + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + +static void aom_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); + + // add and saturate the results together + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + + // filter the source buffer + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + static void aom_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; + __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m256i firstFilters, secondFilters, thirdFilters, forthFilters; __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m256i srcReg32b1, srcReg32b2, filtersReg32; unsigned int i; ptrdiff_t src_stride, dst_stride; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); @@ -74,22 +344,17 @@ static void aom_filter_block1d16_h8_avx2( // across 256 bit register forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { // load the 2 strides of source - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); @@ -110,22 +375,13 @@ static void aom_filter_block1d16_h8_avx2( srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); @@ -148,32 +404,21 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); output_ptr += dst_stride; } @@ -183,7 +428,7 @@ static void aom_filter_block1d16_h8_avx2( __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; __m128i srcRegFilt2, srcRegFilt3; - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); // filter the source buffer srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); @@ -210,15 +455,11 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); // filter the source buffer srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); @@ -245,19 +486,16 @@ static void aom_filter_block1d16_h8_avx2( // add and saturate the results together srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2)); + // shift by 6 bit each 16 bit srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); + _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -269,11 +507,163 @@ static void aom_filter_block1d16_h8_avx2( } } +static void aom_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32; + __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; + __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; + __m256i srcReg32b11, srcReg32b12, filtersReg32; + __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the first 16 bits (first and second byte) + // across 256 bit register + firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + // duplicate only the forth 16 bits (seventh and eighth byte) + // across 256 bit register + forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + // load 16 bytes 7 times in stride of src_pitch + srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); + srcReg32b7 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + + // have each consecutive loads on the same 256 register + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); + // merge every two consecutive registers except the last one + srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); + srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); + srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg32b8 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7))); + srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, + _mm256_castsi256_si128(srcReg32b8), 1); + srcReg32b9 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8))); + srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, + _mm256_castsi256_si128(srcReg32b9), 1); + + // merge every two consecutive registers + // save + srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); + srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); + + // multiply 2 adjacent elements with the filter and add the result + srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); + srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); + + // add and saturate the results together + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); + + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg32b10 = srcReg32b11; + srcReg32b11 = srcReg32b2; + srcReg32b2 = srcReg32b4; + srcReg32b7 = srcReg32b9; + } + if (i > 0) { + __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8; + // load the last 16 bytes + srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + + // merge the last 2 results together + srcRegFilt4 = + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), + _mm256_castsi256_si128(firstFilters)); + srcRegFilt4 = + _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); + + // add and saturate the results together + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), + _mm256_castsi256_si128(secondFilters)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), + _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); + + // shift by 6 bit each 16 bit + srcRegFilt1 = + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1); + } +} + static void aom_filter_block1d16_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; - __m256i addFilterReg64; + __m256i addFilterReg32; __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; __m256i srcReg32b11, srcReg32b12, filtersReg32; @@ -281,11 +671,11 @@ static void aom_filter_block1d16_v8_avx2( unsigned int i; ptrdiff_t src_stride, dst_stride; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); + addFilterReg32 = _mm256_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // have the same data in both lanes of a 256 bit register filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); @@ -308,49 +698,26 @@ static void aom_filter_block1d16_v8_avx2( dst_stride = out_pitch << 1; // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr); + srcReg32b3 = + xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg32b5 = + xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4); srcReg32b7 = _mm256_castsi128_si256( _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - + srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21); + srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21); + srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21); // merge every two consecutive registers except the last one srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); // save srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); for (i = output_height; i > 1; i -= 2) { @@ -383,9 +750,7 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); // multiply 2 adjacent elements with the filter and add the result srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); @@ -399,16 +764,13 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); + _mm256_adds_epi16(srcReg32b8, srcReg32b12)); - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); + // shift by 6 bit each 16 bit + srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32); + srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32); + srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6); + srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -417,12 +779,7 @@ static void aom_filter_block1d16_v8_avx2( src_ptr += src_stride; - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1); output_ptr += dst_stride; @@ -475,24 +832,17 @@ static void aom_filter_block1d16_v8_avx2( // add and saturate the results together srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); + _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6)); srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); + _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7)); - // add and saturate the results together + // shift by 6 bit each 16 bit srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); + _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32)); srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); + _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6); + srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve @@ -506,21 +856,6 @@ static void aom_filter_block1d16_v8_avx2( #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction aom_filter_block1d4_v8_ssse3; -#if ARCH_X86_64 -filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; -#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3 -#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3 -#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3 -#else // ARCH_X86 -filter8_1dfunction aom_filter_block1d8_v8_ssse3; -filter8_1dfunction aom_filter_block1d8_h8_ssse3; -filter8_1dfunction aom_filter_block1d4_h8_ssse3; -#define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3 -#define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3 -#define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3 -#endif // ARCH_X86_64 filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h index 7790baf2e..72fabd236 100644 --- a/third_party/aom/aom_dsp/x86/convolve_avx2.h +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -13,31 +13,27 @@ #define AOM_DSP_X86_CONVOLVE_AVX2_H_ // filters for 16 -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, }; static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *const filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter); const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8); @@ -65,7 +61,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter); const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8); diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h index 846fe7bb4..399df5d6d 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse2.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -19,7 +19,7 @@ static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params, const int subpel_q4, __m128i *const coeffs /* [4] */) { const int16_t *filter = av1_get_interp_filter_subpel_kernel( - *filter_params, subpel_q4 & SUBPEL_MASK); + filter_params, subpel_q4 & SUBPEL_MASK); const __m128i coeff = _mm_loadu_si128((__m128i *)filter); // coeffs 0 1 0 1 0 1 0 1 diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c index e5e3238d5..099fcf7fc 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c @@ -105,8 +105,8 @@ void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int i, j; @@ -254,8 +254,8 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int i, j; diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c index f7ac9b496..e7b33d1c4 100644 --- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c +++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c @@ -18,8 +18,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { @@ -166,8 +166,8 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, - InterpFilterParams *filter_params_x, - InterpFilterParams *filter_params_y, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd) { diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index fdfadc886..131c16aa9 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -676,7 +676,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -726,14 +726,14 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter.taps >> 1) - 1), + aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), ref_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1)), + CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, 16, width, height, bd); } diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c index 9801e285c..eaf1f347b 100644 --- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -22,118 +22,12 @@ void aom_var_filter_block2d_bil_first_pass_ssse3( const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow - // in computation using _mm_maddubs_epi16. - // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. - const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; - const __m128i r = _mm_set1_epi16(round); - const uint8_t f0 = filter[0] >> 1; - const uint8_t f1 = filter[1] >> 1; - const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, - f0, f1, f0, f1, f0, f1); - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); - unsigned int i, j; - (void)pixel_step; - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - // load source - __m128i source_low = xx_loadl_64(a); - __m128i source_hi = _mm_setzero_si128(); - - // avoid load undefined memory - if (a + 8 != NULL) source_hi = xx_loadl_64(a + 8); - __m128i source = _mm_unpacklo_epi64(source_low, source_hi); - - // shuffle to: - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] - __m128i res = _mm_maddubs_epi16(source_shuffle, filters); - - // round - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storeu_128(b, res); - - a += 8; - b += 8; - } - - a += src_pixels_per_line - output_width; - } - } else { - for (i = 0; i < output_height; ++i) { - // load source, only first 5 values are meaningful: - // { a[0], a[1], a[2], a[3], a[4], xxxx } - __m128i source = xx_loadl_64(a); - - // shuffle, up to the first 8 are useful - // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], - // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - __m128i res = _mm_maddubs_epi16(source_shuffle, filters); - res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); - - xx_storel_64(b, res); - - a += src_pixels_per_line; - b += output_width; - } - } -} + unsigned int output_width, const uint8_t *filter); void aom_var_filter_block2d_bil_second_pass_ssse3( const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, - unsigned int output_width, const uint8_t *filter) { - const int16_t round = (1 << FILTER_BITS) >> 1; - const __m128i r = _mm_set1_epi32(round); - const __m128i filters = - _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], - filter[1], filter[0], filter[1]); - const __m128i shuffle_mask = - _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - const __m128i mask = - _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); - unsigned int i, j; - - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - // load source as: - // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } - __m128i source1 = xx_loadl_64(a); - __m128i source2 = xx_loadl_64(a + pixel_step); - __m128i source = _mm_unpacklo_epi64(source1, source2); - - // shuffle source to: - // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } - __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); - - // b[i] = a[i] * filter[0] + a[w + i] * filter[1] - __m128i res = _mm_madd_epi16(source_shuffle, filters); - - // round - res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); - - // shuffle to get each lower 8 bit of every 32 bit - res = _mm_shuffle_epi8(res, mask); - - xx_storel_32(b, res); - - a += 4; - b += 4; - } - - a += src_pixels_per_line - output_width; - } -} + unsigned int output_width, const uint8_t *filter); static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c new file mode 100644 index 000000000..6538e4d5e --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + +static INLINE unsigned int masked_sad32xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 32) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { + __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo)); + __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi)); + __m256i a = _mm256_castsi128_si256(a0); + return _mm256_inserti128_si256(a, a1, 1); +} + +static INLINE unsigned int masked_sad16xh_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_scale = + _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr); + const __m256i m_inv = _mm256_sub_epi8(mask_max, m); + + // Calculate 16 predicted pixels. + // Note that the maximum value of any entry of 'pred_l' or 'pred_r' + // is 64 * 255, so we have plenty of space to add rounding constants. + const __m256i data_l = _mm256_unpacklo_epi8(a, b); + const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv); + __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l); + pred_l = _mm256_mulhrs_epi16(pred_l, round_scale); + + const __m256i data_r = _mm256_unpackhi_epi8(a, b); + const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv); + __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r); + pred_r = _mm256_mulhrs_epi16(pred_r, round_scale); + + const __m256i pred = _mm256_packus_epi16(pred_l, pred_r); + res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. + res = _mm256_shuffle_epi32(res, 0xd8); + res = _mm256_permute4x64_epi64(res, 0xd8); + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int32_t sad = _mm256_extract_epi32(res, 0); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred, + m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 16: + sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define MASKSADMXN_AVX2(m, n) \ + unsigned int aom_masked_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ + int invert_mask) { \ + return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \ + msk, msk_stride, invert_mask, m, n); \ + } + +MASKSADMXN_AVX2(4, 4) +MASKSADMXN_AVX2(4, 8) +MASKSADMXN_AVX2(8, 4) +MASKSADMXN_AVX2(8, 8) +MASKSADMXN_AVX2(8, 16) +MASKSADMXN_AVX2(16, 8) +MASKSADMXN_AVX2(16, 16) +MASKSADMXN_AVX2(16, 32) +MASKSADMXN_AVX2(32, 16) +MASKSADMXN_AVX2(32, 32) +MASKSADMXN_AVX2(32, 64) +MASKSADMXN_AVX2(64, 32) +MASKSADMXN_AVX2(64, 64) +MASKSADMXN_AVX2(64, 128) +MASKSADMXN_AVX2(128, 64) +MASKSADMXN_AVX2(128, 128) +MASKSADMXN_AVX2(4, 16) +MASKSADMXN_AVX2(16, 4) +MASKSADMXN_AVX2(8, 32) +MASKSADMXN_AVX2(32, 8) +MASKSADMXN_AVX2(16, 64) +MASKSADMXN_AVX2(64, 16) + +static INLINE unsigned int highbd_masked_sad8xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y += 2) { + const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); + const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); + const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); + // Zero-extend mask to 16 bits + const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)(m_ptr)), + _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + + src_ptr += src_stride << 1; + a_ptr += a_stride << 1; + b_ptr += b_stride << 1; + m_ptr += m_stride << 1; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int highbd_masked_sad16xh_avx2( + const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, + int width, int height) { + const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); + int x, y; + __m256i res = _mm256_setzero_si256(); + const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i one = _mm256_set1_epi16(1); + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]); + const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]); + const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]); + // Zero-extend mask to 16 bits + const __m256i m = + _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x])); + const __m256i m_inv = _mm256_sub_epi16(mask_max, m); + + const __m256i data_l = _mm256_unpacklo_epi16(a, b); + const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); + __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); + pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m256i data_r = _mm256_unpackhi_epi16(a, b); + const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); + __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); + pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), + AOM_BLEND_A64_ROUND_BITS); + + // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, + // so it is safe to do signed saturation here. + const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); + // There is no 16-bit SAD instruction, so we have to synthesize + // an 8-element SAD. We do this by storing 4 32-bit partial SADs, + // and accumulating them at the end + const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); + res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); + } + + src_ptr += src_stride; + a_ptr += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + // At this point, we have four 32-bit partial SADs stored in 'res'. + res = _mm256_hadd_epi32(res, res); + res = _mm256_hadd_epi32(res, res); + int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); + return (sad + 31) >> 6; +} + +static INLINE unsigned int aom_highbd_masked_sad_avx2( + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, const uint8_t *msk, int msk_stride, + int invert_mask, int m, int n) { + unsigned int sad; + if (!invert_mask) { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride, + second_pred, m, msk, msk_stride, m, n); + break; + } + } else { + switch (m) { + case 4: + sad = + aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + case 8: + sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, n); + break; + default: + sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref, + ref_stride, msk, msk_stride, m, n); + break; + } + } + return sad; +} + +#define HIGHBD_MASKSADMXN_AVX2(m, n) \ + unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ + int msk_stride, int invert_mask) { \ + return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \ + second_pred8, msk, msk_stride, \ + invert_mask, m, n); \ + } + +HIGHBD_MASKSADMXN_AVX2(4, 4); +HIGHBD_MASKSADMXN_AVX2(4, 8); +HIGHBD_MASKSADMXN_AVX2(8, 4); +HIGHBD_MASKSADMXN_AVX2(8, 8); +HIGHBD_MASKSADMXN_AVX2(8, 16); +HIGHBD_MASKSADMXN_AVX2(16, 8); +HIGHBD_MASKSADMXN_AVX2(16, 16); +HIGHBD_MASKSADMXN_AVX2(16, 32); +HIGHBD_MASKSADMXN_AVX2(32, 16); +HIGHBD_MASKSADMXN_AVX2(32, 32); +HIGHBD_MASKSADMXN_AVX2(32, 64); +HIGHBD_MASKSADMXN_AVX2(64, 32); +HIGHBD_MASKSADMXN_AVX2(64, 64); +HIGHBD_MASKSADMXN_AVX2(64, 128); +HIGHBD_MASKSADMXN_AVX2(128, 64); +HIGHBD_MASKSADMXN_AVX2(128, 128); +HIGHBD_MASKSADMXN_AVX2(4, 16); +HIGHBD_MASKSADMXN_AVX2(16, 4); +HIGHBD_MASKSADMXN_AVX2(8, 32); +HIGHBD_MASKSADMXN_AVX2(32, 8); +HIGHBD_MASKSADMXN_AVX2(16, 64); +HIGHBD_MASKSADMXN_AVX2(64, 16); diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c index 1f42eec2f..493f9bd8f 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -19,6 +19,8 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" + // For width a multiple of 16 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, int src_stride, @@ -27,16 +29,6 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - -static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - #define MASKSADMXN_SSSE3(m, n) \ unsigned int aom_masked_sad##m##x##n##_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ @@ -56,11 +48,11 @@ static INLINE unsigned int masked_sad4xh_ssse3( const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ - return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 8, msk, msk_stride, n); \ + return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 8, msk, msk_stride, n); \ else \ - return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ - ref_stride, msk, msk_stride, n); \ + return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \ + ref_stride, msk, msk_stride, n); \ } #define MASKSAD4XN_SSSE3(n) \ @@ -69,11 +61,11 @@ static INLINE unsigned int masked_sad4xh_ssse3( const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \ int invert_mask) { \ if (!invert_mask) \ - return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ - second_pred, 4, msk, msk_stride, n); \ + return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \ + second_pred, 4, msk, msk_stride, n); \ else \ - return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ - ref_stride, msk, msk_stride, n); \ + return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \ + ref_stride, msk, msk_stride, n); \ } MASKSADMXN_SSSE3(128, 128) @@ -145,10 +137,11 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, return (sad + 31) >> 6; } -static INLINE unsigned int masked_sad8xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); @@ -189,10 +182,11 @@ static INLINE unsigned int masked_sad8xh_ssse3( return (sad + 31) >> 6; } -static INLINE unsigned int masked_sad4xh_ssse3( - const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); @@ -238,11 +232,6 @@ static INLINE unsigned int highbd_masked_sad_ssse3( const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int width, int height); -static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int height); - #define HIGHBD_MASKSADMXN_SSSE3(m, n) \ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \ const uint8_t *src8, int src_stride, const uint8_t *ref8, \ @@ -262,11 +251,13 @@ static INLINE unsigned int highbd_masked_sad4xh_ssse3( int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \ int msk_stride, int invert_mask) { \ if (!invert_mask) \ - return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride, \ - second_pred8, 4, msk, msk_stride, n); \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \ + ref_stride, second_pred8, 4, msk, \ + msk_stride, n); \ else \ - return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ - ref8, ref_stride, msk, msk_stride, n); \ + return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \ + ref8, ref_stride, msk, msk_stride, \ + n); \ } HIGHBD_MASKSADMXN_SSSE3(128, 128) @@ -350,10 +341,11 @@ static INLINE unsigned int highbd_masked_sad_ssse3( return (sad + 31) >> 6; } -static INLINE unsigned int highbd_masked_sad4xh_ssse3( - const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, - int height) { +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h new file mode 100644 index 000000000..19b429d91 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H +#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H + +unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr, int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, + const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height); + +#endif diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c new file mode 100644 index 000000000..2aa2a0555 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/synonyms.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + do { + const __m128i v_p_b_0 = xx_loadl_32(pre); + const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride); + const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int obmc_sad_w8n_avx2( + const uint8_t *pre, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_b = xx_loadl_64(pre + n); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if ((n & (width - 1)) == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define OBMCSADWXH(w, h) \ + unsigned int aom_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *msk) { \ + if (w == 4) { \ + return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \ + } else { \ + return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \ + } \ + } + +OBMCSADWXH(128, 128) +OBMCSADWXH(128, 64) +OBMCSADWXH(64, 128) +OBMCSADWXH(64, 64) +OBMCSADWXH(64, 32) +OBMCSADWXH(32, 64) +OBMCSADWXH(32, 32) +OBMCSADWXH(32, 16) +OBMCSADWXH(16, 32) +OBMCSADWXH(16, 16) +OBMCSADWXH(16, 8) +OBMCSADWXH(8, 16) +OBMCSADWXH(8, 8) +OBMCSADWXH(8, 4) +OBMCSADWXH(4, 8) +OBMCSADWXH(4, 4) +OBMCSADWXH(4, 16) +OBMCSADWXH(16, 4) +OBMCSADWXH(8, 32) +OBMCSADWXH(32, 8) +OBMCSADWXH(16, 64) +OBMCSADWXH(64, 16) + +//////////////////////////////////////////////////////////////////////////////// +// High bit-depth +//////////////////////////////////////////////////////////////////////////////// + +static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8, + const int pre_stride, + const int32_t *wsrc, + const int32_t *mask, + const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + do { + const __m128i v_p_w_0 = xx_loadl_64(pre); + const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride); + const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1); + const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d); + + const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d); + const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d); + + // Rounded absolute difference + + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d); + const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d); + + n += 8; + + pre += pre_stride << 1; + } while (n < 8 * (height >> 1)); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +static INLINE unsigned int hbd_obmc_sad_w8n_avx2( + const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, + const int32_t *mask, const int width, const int height) { + const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); + const int pre_step = pre_stride - width; + int n = 0; + __m256i v_sad_d = _mm256_setzero_si256(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + + assert(width >= 8); + assert(IS_POWER_OF_TWO(width)); + + do { + const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n)); + const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n)); + const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n)); + + const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d); + + // Rounded absolute difference + const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d); + const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12); + + v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d); + + n += 8; + + if (n % width == 0) pre += pre_step; + } while (n < width * height); + + __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d); + __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1); + v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1); + return xx_hsum_epi32_si32(v_sad_d_0); +} + +#define HBD_OBMCSADWXH(w, h) \ + unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask) { \ + if (w == 4) { \ + return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \ + } else { \ + return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \ + } \ + } + +HBD_OBMCSADWXH(128, 128) +HBD_OBMCSADWXH(128, 64) +HBD_OBMCSADWXH(64, 128) +HBD_OBMCSADWXH(64, 64) +HBD_OBMCSADWXH(64, 32) +HBD_OBMCSADWXH(32, 64) +HBD_OBMCSADWXH(32, 32) +HBD_OBMCSADWXH(32, 16) +HBD_OBMCSADWXH(16, 32) +HBD_OBMCSADWXH(16, 16) +HBD_OBMCSADWXH(16, 8) +HBD_OBMCSADWXH(8, 16) +HBD_OBMCSADWXH(8, 8) +HBD_OBMCSADWXH(8, 4) +HBD_OBMCSADWXH(4, 8) +HBD_OBMCSADWXH(4, 4) +HBD_OBMCSADWXH(4, 16) +HBD_OBMCSADWXH(16, 4) +HBD_OBMCSADWXH(8, 32) +HBD_OBMCSADWXH(32, 8) +HBD_OBMCSADWXH(16, 64) +HBD_OBMCSADWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 571aa770b..2e2f6e09f 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -26,6 +26,16 @@ // 8 bit //////////////////////////////////////////////////////////////////////////////// +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter); + static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, @@ -152,6 +162,46 @@ OBMCVARWXH(32, 8) OBMCVARWXH(16, 64) OBMCVARWXH(64, 16) +#include "config/aom_dsp_rtcd.h" + +#define OBMC_SUBPIX_VAR(W, H) \ + uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \ + } + +OBMC_SUBPIX_VAR(128, 128) +OBMC_SUBPIX_VAR(128, 64) +OBMC_SUBPIX_VAR(64, 128) +OBMC_SUBPIX_VAR(64, 64) +OBMC_SUBPIX_VAR(64, 32) +OBMC_SUBPIX_VAR(32, 64) +OBMC_SUBPIX_VAR(32, 32) +OBMC_SUBPIX_VAR(32, 16) +OBMC_SUBPIX_VAR(16, 32) +OBMC_SUBPIX_VAR(16, 16) +OBMC_SUBPIX_VAR(16, 8) +OBMC_SUBPIX_VAR(8, 16) +OBMC_SUBPIX_VAR(8, 8) +OBMC_SUBPIX_VAR(8, 4) +OBMC_SUBPIX_VAR(4, 8) +OBMC_SUBPIX_VAR(4, 4) +OBMC_SUBPIX_VAR(4, 16) +OBMC_SUBPIX_VAR(16, 4) +OBMC_SUBPIX_VAR(8, 32) +OBMC_SUBPIX_VAR(32, 8) +OBMC_SUBPIX_VAR(16, 64) +OBMC_SUBPIX_VAR(64, 16) + //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c new file mode 100644 index 000000000..4389d123d --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include + +#include "config/aom_dsp_rtcd.h" + +static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, + const uint8_t *pred_ptr) { + __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr)); + __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s)); + __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1)); + __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p)); + __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1)); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + const __m256i d_1 = _mm256_sub_epi16(s_1, p_1); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1); +} + +static INLINE void aom_subtract_block_16xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr)); + __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr)); + __m256i s_0 = _mm256_cvtepu8_epi16(s); + __m256i p_0 = _mm256_cvtepu8_epi16(p); + const __m256i d_0 = _mm256_sub_epi16(s_0, p_0); + _mm256_store_si256((__m256i *)(diff_ptr), d_0); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_32xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_64xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +static INLINE void aom_subtract_block_128xn_avx2( + int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { + for (int32_t j = 0; j < rows; ++j) { + subtract32_avx2(diff_ptr, src_ptr, pred_ptr); + subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32); + subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64); + subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96); + src_ptr += src_stride; + pred_ptr += pred_stride; + diff_ptr += diff_stride; + } +} + +void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, + ptrdiff_t pred_stride) { + switch (cols) { + case 16: + aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 32: + aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 64: + aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + case 128: + aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + default: + aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); + break; + } +} diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h new file mode 100644 index 000000000..bdff64b8f --- /dev/null +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_ + +#include +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, + int8_t cos_bit); + +static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { + return _mm256_set1_epi32( + (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); +} + +static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1, + __m256i *in0, __m256i *in1, const __m256i _r, + const int32_t cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, w0); + __m256i u1 = _mm256_madd_epi16(t1, w0); + __m256i v0 = _mm256_madd_epi16(t0, w1); + __m256i v1 = _mm256_madd_epi16(t1, w1); + + __m256i a0 = _mm256_add_epi32(u0, _r); + __m256i a1 = _mm256_add_epi32(u1, _r); + __m256i b0 = _mm256_add_epi32(v0, _r); + __m256i b1 = _mm256_add_epi32(v1, _r); + + __m256i c0 = _mm256_srai_epi32(a0, cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, cos_bit); + + *in0 = _mm256_packs_epi32(c0, c1); + *in1 = _mm256_packs_epi32(d0, d1); +} + +static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_adds_epi16(_in0, _in1); + *in1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) { + const __m256i _in0 = *in0; + const __m256i _in1 = *in1; + *in0 = _mm256_add_epi32(_in0, _in1); + *in1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_adds_epi16(_in0, _in1); + *out1 = _mm256_subs_epi16(_in0, _in1); +} + +static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1, + __m256i in0, __m256i in1) { + const __m256i _in0 = in0; + const __m256i _in1 = in1; + *out0 = _mm256_add_epi32(_in0, _in1); + *out1 = _mm256_sub_epi32(_in0, _in1); +} + +static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) { + return _mm256_load_si256((const __m256i *)a); +} + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in, + int stride, + __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride); + } +} + +static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) { + const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a); + const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + return _mm256_permute4x64_epi64(b, 0xD8); +} + +static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, + int stride, __m256i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f + // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f + // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f + // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f + // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f + // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f + // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f + // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f + // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // ... + __m256i a[16]; + for (int i = 0; i < 16; i += 2) { + a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); + a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); + } + __m256i b[16]; + for (int i = 0; i < 16; i += 2) { + b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); + b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); + } + __m256i c[16]; + for (int i = 0; i < 16; i += 2) { + c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); + c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); + } + out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); + out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); + out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); + out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); + + out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); + out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); + out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); + out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); + + out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); + out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); + out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); + out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); + + out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); + out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); + out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); + out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); +} + +static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + +static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi16(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[i] = _mm256_adds_epi16(in[i], round); + in[i] = _mm256_srai_epi16(in[i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[i] = _mm256_slli_epi16(in[i], bit); + } + } +} + +#ifdef __cplusplus +} +#endif + +#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index 7d6b7d287..a7ac2c93d 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -324,6 +324,12 @@ static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) { return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); } +static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) { + const __m256i d = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1)); + return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1); +} + static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1, const __m256i a, uint8_t *comp_pred) { @@ -401,3 +407,110 @@ void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, } while (i < height); } } + +static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, + const __m256i s1, + const __m256i a) { + const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m256i round_const = + _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m256i a_inv = _mm256_sub_epi16(alpha_max, a); + + const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv); + const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo); + const __m256i pred_l = _mm256_srai_epi32( + _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1); + const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv); + const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi); + const __m256i pred_h = _mm256_srai_epi32( + _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS); + + const __m256i comp = _mm256_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + const __m256i zero = _mm256_setzero_si256(); + + if (width == 8) { + do { + const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0); + const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1); + + const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8)); + + __m256i m = _mm256_castsi128_si256(m_l); + m = _mm256_insertf128_si256(m, m_h, 1); + const __m256i m_16 = _mm256_unpacklo_epi8(m, zero); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp)); + + _mm_storeu_si128((__m128i *)(comp_pred + width), + _mm256_extractf128_si256(comp, 1)); + + src0 += (stride0 << 1); + src1 += (stride1 << 1); + mask += (mask_stride << 1); + comp_pred += (width << 1); + i += 2; + } while (i < height); + } else if (width == 16) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1)); + const __m256i m_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0); + const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16)); + const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1); + const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16)); + + const __m256i m01_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask)); + const __m256i m23_16 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16))); + + const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16); + const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16); + + _mm256_storeu_si256((__m256i *)comp_pred, comp); + _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c new file mode 100644 index 000000000..66b0d7d84 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/x86/synonyms.h" + +void aom_var_filter_block2d_bil_first_pass_ssse3( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow + // in computation using _mm_maddubs_epi16. + // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow. + const int16_t round = (1 << (FILTER_BITS - 1)) >> 1; + const __m128i r = _mm_set1_epi16(round); + const uint8_t f0 = filter[0] >> 1; + const uint8_t f1 = filter[1] >> 1; + const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1, + f0, f1, f0, f1, f0, f1); + unsigned int i, j; + (void)pixel_step; + + if (output_width >= 8) { + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 8) { + // load source + __m128i source_low = xx_loadl_64(a); + __m128i source_hi = xx_loadl_64(a + 1); + + // unpack to: + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source = _mm_unpacklo_epi8(source_low, source_hi); + + // b[i] = a[i] * filter[0] + a[i + 1] * filter[1] + __m128i res = _mm_maddubs_epi16(source, filters); + + // round + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storeu_128(b, res); + + a += 8; + b += 8; + } + + a += src_pixels_per_line - output_width; + } + } else { + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + for (i = 0; i < output_height; ++i) { + // load source, only first 5 values are meaningful: + // { a[0], a[1], a[2], a[3], a[4], xxxx } + __m128i source = xx_loadl_64(a); + + // shuffle, up to the first 8 are useful + // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4], + // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + __m128i res = _mm_maddubs_epi16(source_shuffle, filters); + res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1); + + xx_storel_64(b, res); + + a += src_pixels_per_line; + b += output_width; + } + } +} + +void aom_var_filter_block2d_bil_second_pass_ssse3( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { + const int16_t round = (1 << FILTER_BITS) >> 1; + const __m128i r = _mm_set1_epi32(round); + const __m128i filters = + _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0], + filter[1], filter[0], filter[1]); + const __m128i shuffle_mask = + _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + const __m128i mask = + _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1); + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; j += 4) { + // load source as: + // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] } + __m128i source1 = xx_loadl_64(a); + __m128i source2 = xx_loadl_64(a + pixel_step); + __m128i source = _mm_unpacklo_epi64(source1, source2); + + // shuffle source to: + // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] } + __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask); + + // b[i] = a[i] * filter[0] + a[w + i] * filter[1] + __m128i res = _mm_madd_epi16(source_shuffle, filters); + + // round + res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS); + + // shuffle to get each lower 8 bit of every 32 bit + res = _mm_shuffle_epi8(res, mask); + + xx_storel_32(b, res); + + a += 4; + b += 4; + } + + a += src_pixels_per_line - output_width; + } +} diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index c8c90a7dc..7e3c5d5db 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -569,7 +569,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, } } - const InterpFilterParams filter = + const InterpFilterParams *filter = av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { @@ -633,12 +633,12 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter.taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter.taps >> 1) - 1), ref_stride, - temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter.taps >> 1) - 1), + aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, width, height); } -- cgit v1.2.3 From d2499ead93dc4298c0882fe98902acb1b5209f99 Mon Sep 17 00:00:00 2001 From: trav90 Date: Fri, 19 Oct 2018 23:05:00 -0500 Subject: Update libaom to commit ID 1e227d41f0616de9548a673a83a21ef990b62591 --- third_party/aom/aom_dsp/aom_convolve.c | 1 - third_party/aom/aom_dsp/aom_convolve.h | 56 - third_party/aom/aom_dsp/aom_dsp.cmake | 30 +- third_party/aom/aom_dsp/aom_dsp_common.h | 6 +- third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 118 +- third_party/aom/aom_dsp/aom_filter.h | 6 +- third_party/aom/aom_dsp/aom_simd.h | 6 +- third_party/aom/aom_dsp/aom_simd_inline.h | 6 +- third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c | 6 +- third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 22 +- third_party/aom/aom_dsp/binary_codes_reader.c | 14 +- third_party/aom/aom_dsp/binary_codes_reader.h | 8 +- third_party/aom/aom_dsp/binary_codes_writer.c | 18 +- third_party/aom/aom_dsp/binary_codes_writer.h | 7 +- third_party/aom/aom_dsp/bitreader.h | 12 +- third_party/aom/aom_dsp/bitreader_buffer.c | 10 + third_party/aom/aom_dsp/bitreader_buffer.h | 8 +- third_party/aom/aom_dsp/bitwriter.h | 6 +- third_party/aom/aom_dsp/bitwriter_buffer.c | 12 + third_party/aom/aom_dsp/bitwriter_buffer.h | 8 +- third_party/aom/aom_dsp/blend.h | 6 +- third_party/aom/aom_dsp/buf_ans.h | 6 +- third_party/aom/aom_dsp/daalaboolreader.c | 6 + third_party/aom/aom_dsp/daalaboolreader.h | 9 +- third_party/aom/aom_dsp/daalaboolwriter.h | 6 +- third_party/aom/aom_dsp/entcode.h | 6 +- third_party/aom/aom_dsp/entdec.c | 1 + third_party/aom/aom_dsp/entdec.h | 6 +- third_party/aom/aom_dsp/entenc.c | 1 + third_party/aom/aom_dsp/entenc.h | 6 +- third_party/aom/aom_dsp/fft_common.h | 6 +- third_party/aom/aom_dsp/grain_synthesis.c | 67 +- third_party/aom/aom_dsp/grain_synthesis.h | 24 +- third_party/aom/aom_dsp/grain_table.h | 2 +- third_party/aom/aom_dsp/intrapred_common.h | 6 +- third_party/aom/aom_dsp/mips/aom_convolve_msa.h | 6 +- third_party/aom/aom_dsp/mips/common_dspr2.h | 6 +- .../aom/aom_dsp/mips/convolve2_horiz_dspr2.c | 1 - .../aom/aom_dsp/mips/convolve2_vert_dspr2.c | 1 - .../aom/aom_dsp/mips/convolve_common_dspr2.h | 6 +- .../aom/aom_dsp/mips/loopfilter_filters_dspr2.h | 6 +- .../aom/aom_dsp/mips/loopfilter_macros_dspr2.h | 6 +- .../aom/aom_dsp/mips/loopfilter_masks_dspr2.h | 6 +- third_party/aom/aom_dsp/mips/loopfilter_msa.h | 6 +- third_party/aom/aom_dsp/mips/macros_msa.h | 6 +- .../aom/aom_dsp/mips/sub_pixel_variance_msa.c | 22 +- third_party/aom/aom_dsp/noise_model.c | 4 +- third_party/aom/aom_dsp/noise_model.h | 6 +- third_party/aom/aom_dsp/noise_util.h | 6 +- third_party/aom/aom_dsp/postproc.h | 6 +- third_party/aom/aom_dsp/prob.h | 6 +- third_party/aom/aom_dsp/psnr.c | 1 - third_party/aom/aom_dsp/psnr.h | 6 +- third_party/aom/aom_dsp/quantize.c | 217 ++-- third_party/aom/aom_dsp/quantize.h | 29 +- third_party/aom/aom_dsp/sad.c | 7 +- third_party/aom/aom_dsp/simd/v128_intrinsics.h | 6 +- third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h | 9 +- third_party/aom/aom_dsp/simd/v128_intrinsics_c.h | 6 +- third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h | 6 +- third_party/aom/aom_dsp/simd/v256_intrinsics.h | 6 +- third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h | 6 +- third_party/aom/aom_dsp/simd/v256_intrinsics_c.h | 6 +- .../aom/aom_dsp/simd/v256_intrinsics_v128.h | 6 +- third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h | 6 +- third_party/aom/aom_dsp/simd/v64_intrinsics.h | 6 +- third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h | 6 +- third_party/aom/aom_dsp/simd/v64_intrinsics_c.h | 6 +- third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h | 6 +- third_party/aom/aom_dsp/sse.c | 52 + third_party/aom/aom_dsp/ssim.c | 2 +- third_party/aom/aom_dsp/ssim.h | 6 +- third_party/aom/aom_dsp/txfm_common.h | 6 +- third_party/aom/aom_dsp/variance.c | 399 +++--- third_party/aom/aom_dsp/variance.h | 16 +- third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 7 + .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 556 +++++++++ .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 7 + third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c | 900 ++++++++++++++ third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c | 514 ++++---- third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c | 8 +- third_party/aom/aom_dsp/x86/blend_mask_sse4.h | 237 ++++ third_party/aom/aom_dsp/x86/blend_sse4.h | 69 +- third_party/aom/aom_dsp/x86/common_avx2.h | 6 +- third_party/aom/aom_dsp/x86/convolve.h | 32 +- third_party/aom/aom_dsp/x86/convolve_avx2.h | 11 +- .../aom/aom_dsp/x86/convolve_common_intrin.h | 6 +- third_party/aom/aom_dsp/x86/convolve_sse2.h | 6 +- third_party/aom/aom_dsp/x86/convolve_sse4_1.h | 6 +- third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 6 +- .../aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm | 351 ------ .../aom/aom_dsp/x86/halfpix_variance_sse2.c | 78 -- .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 707 ++++++----- .../aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 70 +- .../aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 149 ++- third_party/aom/aom_dsp/x86/highbd_variance_avx2.c | 140 +++ third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 72 +- third_party/aom/aom_dsp/x86/highbd_variance_sse4.c | 12 +- third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c | 4 +- third_party/aom/aom_dsp/x86/loopfilter_avx2.c | 916 -------------- third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 1275 +++++++++++++++----- third_party/aom/aom_dsp/x86/lpf_common_sse2.h | 6 +- .../aom/aom_dsp/x86/masked_sad_intrin_avx2.c | 1 - .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.h | 6 +- .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.h | 6 +- third_party/aom/aom_dsp/x86/mem_sse2.h | 6 +- third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h | 58 + third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h | 15 +- third_party/aom/aom_dsp/x86/obmc_variance_avx2.c | 190 +++ third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 41 +- .../aom/aom_dsp/x86/quantize_avx_x86_64.asm | 67 +- third_party/aom/aom_dsp/x86/quantize_sse2.c | 329 ++--- .../aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 44 +- third_party/aom/aom_dsp/x86/quantize_x86.h | 77 ++ third_party/aom/aom_dsp/x86/sse_avx2.c | 250 ++++ third_party/aom/aom_dsp/x86/sse_sse4.c | 241 ++++ third_party/aom/aom_dsp/x86/sum_squares_avx2.c | 79 ++ third_party/aom/aom_dsp/x86/sum_squares_sse2.c | 10 +- third_party/aom/aom_dsp/x86/sum_squares_sse2.h | 22 + third_party/aom/aom_dsp/x86/synonyms.h | 15 +- third_party/aom/aom_dsp/x86/synonyms_avx2.h | 16 +- third_party/aom/aom_dsp/x86/transpose_sse2.h | 6 +- third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 6 +- third_party/aom/aom_dsp/x86/txfm_common_sse2.h | 6 +- third_party/aom/aom_dsp/x86/variance_avx2.c | 3 +- third_party/aom/aom_dsp/x86/variance_sse2.c | 166 ++- 126 files changed, 5796 insertions(+), 3455 deletions(-) delete mode 100644 third_party/aom/aom_dsp/aom_convolve.h create mode 100644 third_party/aom/aom_dsp/sse.c create mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/blend_mask_sse4.h delete mode 100644 third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm delete mode 100644 third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c create mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_avx2.c delete mode 100644 third_party/aom/aom_dsp/x86/loopfilter_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h create mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/quantize_x86.h create mode 100644 third_party/aom/aom_dsp/x86/sse_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sse_sse4.c create mode 100644 third_party/aom/aom_dsp/x86/sum_squares_avx2.c create mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.h (limited to 'third_party/aom/aom_dsp') diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c index bba37e227..4791826da 100644 --- a/third_party/aom/aom_dsp/aom_convolve.c +++ b/third_party/aom/aom_dsp/aom_convolve.c @@ -16,7 +16,6 @@ #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/aom_convolve.h b/third_party/aom/aom_dsp/aom_convolve.h deleted file mode 100644 index 6f5b888e4..000000000 --- a/third_party/aom/aom_dsp/aom_convolve.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ -#ifndef AOM_DSP_AOM_CONVOLVE_H_ -#define AOM_DSP_AOM_CONVOLVE_H_ - -#include "config/aom_config.h" - -#include "aom/aom_integer.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// Note: Fixed size intermediate buffers, place limits on parameters -// of some functions. 2d filtering proceeds in 2 steps: -// (1) Interpolate horizontally into an intermediate buffer, temp. -// (2) Interpolate temp vertically to derive the sub-pixel result. -// Deriving the maximum number of rows in the temp buffer (135): -// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). -// --Largest block size is 64x64 pixels. -// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the -// original frame (in 1/16th pixel units). -// --Must round-up because block may be located at sub-pixel position. -// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. -// --((64 - 1) * 32 + 15) >> 4 + 8 = 135. -// TODO(wtc): Update the above comment to explain the value 263 used in aom. -#define MAX_EXT_SIZE 263 - -#define EXTRAPREC_BITS 2 -#define EXTRAPREC_CLAMP_LIMIT(bd) (1 << ((bd) + 1 + EXTRAPREC_BITS)) - -typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); - -typedef void (*highbd_convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h, int bd); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // AOM_DSP_AOM_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake index 7c0111a69..11ff73756 100644 --- a/third_party/aom/aom_dsp/aom_dsp.cmake +++ b/third_party/aom/aom_dsp/aom_dsp.cmake @@ -15,11 +15,14 @@ set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1) list(APPEND AOM_DSP_COMMON_SOURCES "${AOM_ROOT}/aom_dsp/aom_convolve.c" - "${AOM_ROOT}/aom_dsp/aom_convolve.h" "${AOM_ROOT}/aom_dsp/aom_dsp_common.h" "${AOM_ROOT}/aom_dsp/aom_filter.h" "${AOM_ROOT}/aom_dsp/aom_simd.h" "${AOM_ROOT}/aom_dsp/aom_simd_inline.h" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" + "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" + "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" "${AOM_ROOT}/aom_dsp/blend.h" "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c" "${AOM_ROOT}/aom_dsp/blend_a64_mask.c" @@ -64,7 +67,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h" "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h" "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h") + "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h") list(APPEND AOM_DSP_COMMON_ASM_SSSE3 "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm" @@ -76,6 +80,7 @@ list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c") list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1 + "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h" "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c" "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c") @@ -88,7 +93,8 @@ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2 "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c") list(APPEND AOM_DSP_COMMON_INTRIN_NEON "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c" @@ -125,12 +131,9 @@ if(CONFIG_AV1_DECODER) "${AOM_ROOT}/aom_dsp/binary_codes_reader.c" "${AOM_ROOT}/aom_dsp/binary_codes_reader.h" "${AOM_ROOT}/aom_dsp/bitreader.h" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.c" - "${AOM_ROOT}/aom_dsp/bitreader_buffer.h" "${AOM_ROOT}/aom_dsp/daalaboolreader.c" "${AOM_ROOT}/aom_dsp/daalaboolreader.h" - "${AOM_ROOT}/aom_dsp/entdec.c" - "${AOM_ROOT}/aom_dsp/entdec.h" + "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h" "${AOM_ROOT}/aom_dsp/grain_synthesis.c" "${AOM_ROOT}/aom_dsp/grain_synthesis.h") endif() @@ -140,8 +143,6 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/binary_codes_writer.c" "${AOM_ROOT}/aom_dsp/binary_codes_writer.h" "${AOM_ROOT}/aom_dsp/bitwriter.h" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c" - "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h" "${AOM_ROOT}/aom_dsp/daalaboolwriter.c" "${AOM_ROOT}/aom_dsp/daalaboolwriter.h" "${AOM_ROOT}/aom_dsp/entenc.c" @@ -158,13 +159,13 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/quantize.c" "${AOM_ROOT}/aom_dsp/quantize.h" "${AOM_ROOT}/aom_dsp/sad.c" + "${AOM_ROOT}/aom_dsp/sse.c" "${AOM_ROOT}/aom_dsp/sad_av1.c" "${AOM_ROOT}/aom_dsp/sum_squares.c" "${AOM_ROOT}/aom_dsp/variance.c" "${AOM_ROOT}/aom_dsp/variance.h") list(APPEND AOM_DSP_ENCODER_ASM_SSE2 - "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_impl_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm" @@ -178,11 +179,11 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c" "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h" - "${AOM_ROOT}/aom_dsp/x86/halfpix_variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") @@ -199,8 +200,12 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c" "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c" "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c" - "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c") + "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c" + "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c") list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") @@ -219,6 +224,7 @@ if(CONFIG_AV1_ENCODER) list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c" + "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c" "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c" "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c") diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h index c5dc9a834..a185b23c8 100644 --- a/third_party/aom/aom_dsp/aom_dsp_common.h +++ b/third_party/aom/aom_dsp/aom_dsp_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_DSP_COMMON_H_ -#define AOM_DSP_AOM_DSP_COMMON_H_ +#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_ +#define AOM_AOM_DSP_AOM_DSP_COMMON_H_ #include "config/aom_config.h" @@ -95,4 +95,4 @@ static INLINE unsigned int negative_to_zero(int value) { } // extern "C" #endif -#endif // AOM_DSP_AOM_DSP_COMMON_H_ +#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_ diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl index 1a9ac3184..8e8a480fe 100755 --- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -519,23 +519,23 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){ # Quantization # if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; - add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; } # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { - add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b sse2 avx2/; - add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_highbd_quantize_b_32x32 sse2/; - add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; } # CONFIG_AV1_ENCODER @@ -543,12 +543,12 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # Alpha blending with mask # add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params"; -specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 neon/; +specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/; add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd"; add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby"; add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h"; -specialize "aom_blend_a64_mask", qw/sse4_1/; +specialize "aom_blend_a64_mask", qw/sse4_1 avx2/; specialize "aom_blend_a64_hmask", qw/sse4_1 neon/; specialize "aom_blend_a64_vmask", qw/sse4_1 neon/; @@ -569,15 +569,22 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/aom_highbd_subtract_block sse2/; + add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; + specialize qw/aom_sse sse4_1 avx2/; + + add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; + specialize qw/aom_highbd_sse sse4_1 avx2/; + if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # # Sum of Squares # add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; - specialize qw/aom_sum_squares_2d_i16 sse2/; + specialize qw/aom_sum_squares_2d_i16 sse2 avx2/; add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; specialize qw/aom_sum_squares_i16 sse2/; + } @@ -830,7 +837,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_highbd_sad16x64x4d sse2/; specialize qw/aom_highbd_sad64x16x4d sse2/; - # # Structured Similarity (SSIM) # @@ -888,36 +894,43 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, int ref_stride"; + int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search"; specialize qw/aom_upsampled_pred sse2/; add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride"; + int ref_stride, int subpel_search"; specialize qw/aom_comp_avg_upsampled_pred sse2/; add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param"; + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/; + add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search"; + specialize qw/aom_comp_mask_upsampled_pred sse2/; + add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; specialize qw/aom_highbd_upsampled_pred sse2/; add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd"; + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/; add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param"; + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/; @@ -1101,7 +1114,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse"; - specialize "aom_obmc_variance${w}x${h}", q/sse4_1/; + specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/; specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/; } @@ -1154,17 +1167,6 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/; - # - # Specialty Subpixel - # - add_proto qw/uint32_t aom_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_h sse2/; - - add_proto qw/uint32_t aom_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_v sse2/; - - add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/aom_variance_halfpixvar16x16_hv sse2/; # # Comp Avg @@ -1174,6 +1176,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; specialize qw/aom_jnt_comp_avg_pred ssse3/; + add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_12_variance64x128 sse2/; add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/aom_highbd_12_variance64x64 sse2/; @@ -1209,40 +1219,58 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x128 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance128x64 sse2 avx2/; + + add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_10_variance64x128 sse2 avx2/; + add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x64 sse2/; + specialize qw/aom_highbd_10_variance64x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance64x32 sse2/; + specialize qw/aom_highbd_10_variance64x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x64 sse2/; + specialize qw/aom_highbd_10_variance32x64 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x32 sse2/; + specialize qw/aom_highbd_10_variance32x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance32x16 sse2/; + specialize qw/aom_highbd_10_variance32x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x32 sse2/; + specialize qw/aom_highbd_10_variance16x32 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x16 sse2/; + specialize qw/aom_highbd_10_variance16x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance16x8 sse2/; + specialize qw/aom_highbd_10_variance16x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x16 sse2/; + specialize qw/aom_highbd_10_variance8x16 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/aom_highbd_10_variance8x8 sse2/; + specialize qw/aom_highbd_10_variance8x8 sse2 avx2/; add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x128 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance128x64 sse2/; + + add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + specialize qw/aom_highbd_8_variance64x128 sse2/; + add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/aom_highbd_8_variance64x64 sse2/; @@ -1310,9 +1338,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/aom_highbd_12_mse8x8 sse2/; - add_proto qw/void aom_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; - add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; + add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; specialize qw/aom_highbd_jnt_comp_avg_pred sse2/; # @@ -1539,8 +1567,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; specialize qw/aom_comp_mask_pred ssse3 avx2/; - add_proto qw/void aom_highbd_comp_mask_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; - specialize qw/aom_highbd_comp_mask_pred avx2/; + add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; + specialize qw/aom_highbd_comp_mask_pred sse2 avx2/; } # CONFIG_AV1_ENCODER diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h index fd4f51b29..00686ac38 100644 --- a/third_party/aom/aom_dsp/aom_filter.h +++ b/third_party/aom/aom_dsp/aom_filter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_FILTER_H_ -#define AOM_DSP_AOM_FILTER_H_ +#ifndef AOM_AOM_DSP_AOM_FILTER_H_ +#define AOM_AOM_DSP_AOM_FILTER_H_ #include "aom/aom_integer.h" @@ -53,4 +53,4 @@ static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = { } // extern "C" #endif -#endif // AOM_DSP_AOM_FILTER_H_ +#endif // AOM_AOM_DSP_AOM_FILTER_H_ diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h index 392b36627..ab950ca55 100644 --- a/third_party/aom/aom_dsp/aom_simd.h +++ b/third_party/aom/aom_dsp/aom_simd.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_AOM_SIMD_H_ -#define AOM_DSP_AOM_AOM_SIMD_H_ +#ifndef AOM_AOM_DSP_AOM_SIMD_H_ +#define AOM_AOM_DSP_AOM_SIMD_H_ #include @@ -35,4 +35,4 @@ #include "simd/v256_intrinsics.h" #endif -#endif // AOM_DSP_AOM_AOM_SIMD_H_ +#endif // AOM_AOM_DSP_AOM_SIMD_H_ diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h index 02a8b3a17..eb333f6f6 100644 --- a/third_party/aom/aom_dsp/aom_simd_inline.h +++ b/third_party/aom/aom_dsp/aom_simd_inline.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_AOM_SIMD_INLINE_H_ -#define AOM_DSP_AOM_SIMD_INLINE_H_ +#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_ +#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_ #include "aom/aom_integer.h" @@ -18,4 +18,4 @@ #define SIMD_INLINE static AOM_FORCE_INLINE #endif -#endif // AOM_DSP_AOM_SIMD_INLINE_H_ +#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_ diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c index 82c0b0e28..e7f08a5fd 100644 --- a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c +++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c @@ -86,7 +86,8 @@ static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride, const int16x8_t vec_round_bits) { int16x8_t src0_0, src0_1; int16x8_t src1_0, src1_1; - uint64x2_t tu0, tu1, tu2, tu3; + uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0), + tu3 = vdupq_n_u64(0); int16x8_t mask0_1, mask2_3; int16x8_t res0, res1; @@ -154,7 +155,8 @@ void aom_lowbd_blend_a64_d16_mask_neon( assert(IS_POWER_OF_TWO(w)); uint8x8_t s0, s1, s2, s3; - uint32x2_t tu0, tu1, tu2, tu3; + uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0), + tu3 = vdup_n_u32(0); uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; int16x8_t mask0, mask1, mask2, mask3; int16x8_t mask4, mask5, mask6, mask7; diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c index 44d821821..cf618eee7 100644 --- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c +++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c @@ -17,13 +17,9 @@ #include "aom_ports/mem.h" #include "aom/aom_integer.h" +#include "aom_dsp/aom_filter.h" #include "aom_dsp/variance.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, @@ -83,9 +79,9 @@ unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride, DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]); var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse); } @@ -98,9 +94,9 @@ unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src, DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]); var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse); } @@ -113,9 +109,9 @@ unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src, DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]); var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse); } @@ -128,8 +124,8 @@ unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src, DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]); var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64, - bilinear_filters[xoffset]); + bilinear_filters_2t[xoffset]); var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64, - bilinear_filters[yoffset]); + bilinear_filters_2t[yoffset]); return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse); } diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c index d05c3efdc..01088010a 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.c +++ b/third_party/aom/aom_dsp/binary_codes_reader.c @@ -36,7 +36,7 @@ static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) { uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM) { if (n <= 1) return 0; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME); return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME); @@ -45,7 +45,7 @@ uint16_t aom_read_primitive_quniform_(aom_reader *r, static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb, uint16_t n) { if (n <= 1) return 0; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; const int v = aom_rb_read_literal(rb, l - 1); return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb); @@ -121,13 +121,3 @@ int16_t aom_rb_read_signed_primitive_refsubexpfin( const uint16_t scaled_n = (n << 1) - 1; return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1; } - -uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { - int leading_zeros = 0; - while (!aom_rb_read_bit(rb)) ++leading_zeros; - // Maximum 32 bits. - if (leading_zeros >= 32) return UINT32_MAX; - const uint32_t base = (1u << leading_zeros) - 1; - const uint32_t value = aom_rb_read_literal(rb, leading_zeros); - return base + value; -} diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h index 5253c6154..364a67469 100644 --- a/third_party/aom/aom_dsp/binary_codes_reader.h +++ b/third_party/aom/aom_dsp/binary_codes_reader.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BINARY_CODES_READER_H_ -#define AOM_DSP_BINARY_CODES_READER_H_ +#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_ +#define AOM_AOM_DSP_BINARY_CODES_READER_H_ #ifdef __cplusplus extern "C" { @@ -40,10 +40,8 @@ uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k, int16_t aom_rb_read_signed_primitive_refsubexpfin( struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref); -uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); - #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BINARY_CODES_READER_H_ +#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_ diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c index 8f74f0942..ee7a9f567 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.c +++ b/third_party/aom/aom_dsp/binary_codes_writer.c @@ -59,7 +59,7 @@ int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) { // Encodes a value v in [0, n-1] quasi-uniformly void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { if (n <= 1) return; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; if (v < m) { aom_write_literal(w, v, l - 1); @@ -72,7 +72,7 @@ void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) { static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, uint16_t n, uint16_t v) { if (n <= 1) return; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; if (v < m) { aom_wb_write_literal(wb, v, l - 1); @@ -84,7 +84,7 @@ static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb, int aom_count_primitive_quniform(uint16_t n, uint16_t v) { if (n <= 1) return 0; - const int l = get_msb(n - 1) + 1; + const int l = get_msb(n) + 1; const int m = (1 << l) - n; return v < m ? l - 1 : l; } @@ -208,15 +208,3 @@ int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, const uint16_t scaled_n = (n << 1) - 1; return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v); } - -void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { - int64_t shift_val = ++v; - int leading_zeroes = 1; - - assert(shift_val > 0); - - while (shift_val >>= 1) leading_zeroes += 2; - - aom_wb_write_literal(wb, 0, leading_zeroes >> 1); - aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); -} diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h index 784c721a6..c360e0e29 100644 --- a/third_party/aom/aom_dsp/binary_codes_writer.h +++ b/third_party/aom/aom_dsp/binary_codes_writer.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BINARY_CODES_WRITER_H_ -#define AOM_DSP_BINARY_CODES_WRITER_H_ +#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_ +#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_ #ifdef __cplusplus extern "C" { @@ -61,9 +61,8 @@ int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref, uint16_t v); int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref, int16_t v); -void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BINARY_CODES_WRITER_H_ +#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_ diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h index 328935be9..7c0efcc78 100644 --- a/third_party/aom/aom_dsp/bitreader.h +++ b/third_party/aom/aom_dsp/bitreader.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITREADER_H_ -#define AOM_DSP_BITREADER_H_ +#ifndef AOM_AOM_DSP_BITREADER_H_ +#define AOM_AOM_DSP_BITREADER_H_ #include #include @@ -69,6 +69,12 @@ static INLINE int aom_reader_has_error(aom_reader *r) { return aom_daala_reader_has_error(r); } +// Returns true if the bit reader has tried to decode more data from the buffer +// than was actually provided. +static INLINE int aom_reader_has_overflowed(const aom_reader *r) { + return aom_daala_reader_has_overflowed(r); +} + // Returns the position in the bit reader in bits. static INLINE uint32_t aom_reader_tell(const aom_reader *r) { return aom_daala_reader_tell(r); @@ -151,4 +157,4 @@ static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf, } // extern "C" #endif -#endif // AOM_DSP_BITREADER_H_ +#endif // AOM_AOM_DSP_BITREADER_H_ diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c index 02b5ef924..b53211784 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.c +++ b/third_party/aom/aom_dsp/bitreader_buffer.c @@ -55,3 +55,13 @@ int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits; return ((int)value) >> nbits; } + +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { + int leading_zeros = 0; + while (!aom_rb_read_bit(rb)) ++leading_zeros; + // Maximum 32 bits. + if (leading_zeros >= 32) return UINT32_MAX; + const uint32_t base = (1u << leading_zeros) - 1; + const uint32_t value = aom_rb_read_literal(rb, leading_zeros); + return base + value; +} diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h index 5c94ab883..725ca1ea2 100644 --- a/third_party/aom/aom_dsp/bitreader_buffer.h +++ b/third_party/aom/aom_dsp/bitreader_buffer.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITREADER_BUFFER_H_ -#define AOM_DSP_BITREADER_BUFFER_H_ +#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_ +#define AOM_AOM_DSP_BITREADER_BUFFER_H_ #include @@ -41,8 +41,10 @@ uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits); int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits); +uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb); + #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BITREADER_BUFFER_H_ +#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h index de1b1d048..b5ecc2382 100644 --- a/third_party/aom/aom_dsp/bitwriter.h +++ b/third_party/aom/aom_dsp/bitwriter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITWRITER_H_ -#define AOM_DSP_BITWRITER_H_ +#ifndef AOM_AOM_DSP_BITWRITER_H_ +#define AOM_AOM_DSP_BITWRITER_H_ #include @@ -86,4 +86,4 @@ static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf, } // extern "C" #endif -#endif // AOM_DSP_BITWRITER_H_ +#endif // AOM_AOM_DSP_BITWRITER_H_ diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c index a563bf684..596246deb 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.c +++ b/third_party/aom/aom_dsp/bitwriter_buffer.c @@ -73,3 +73,15 @@ void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, int bits) { aom_wb_write_literal(wb, data, bits + 1); } + +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) { + int64_t shift_val = ++v; + int leading_zeroes = 1; + + assert(shift_val > 0); + + while (shift_val >>= 1) leading_zeroes += 2; + + aom_wb_write_literal(wb, 0, leading_zeroes >> 1); + aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1); +} diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h index f7f75a097..d0311284f 100644 --- a/third_party/aom/aom_dsp/bitwriter_buffer.h +++ b/third_party/aom/aom_dsp/bitwriter_buffer.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BITWRITER_BUFFER_H_ -#define AOM_DSP_BITWRITER_BUFFER_H_ +#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_ +#define AOM_AOM_DSP_BITWRITER_BUFFER_H_ #include "aom/aom_integer.h" @@ -42,8 +42,10 @@ void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data, void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data, int bits); +void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v); + #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_BITWRITER_BUFFER_H_ +#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_ diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h index 434bb83a1..fd87dc181 100644 --- a/third_party/aom/aom_dsp/blend.h +++ b/third_party/aom/aom_dsp/blend.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BLEND_H_ -#define AOM_DSP_BLEND_H_ +#ifndef AOM_AOM_DSP_BLEND_H_ +#define AOM_AOM_DSP_BLEND_H_ #include "aom_ports/mem.h" @@ -42,4 +42,4 @@ #define DIFF_FACTOR_LOG2 4 #define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2) -#endif // AOM_DSP_BLEND_H_ +#endif // AOM_AOM_DSP_BLEND_H_ diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h index cf7df1dbf..985fcdf9e 100644 --- a/third_party/aom/aom_dsp/buf_ans.h +++ b/third_party/aom/aom_dsp/buf_ans.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_BUF_ANS_H_ -#define AOM_DSP_BUF_ANS_H_ +#ifndef AOM_AOM_DSP_BUF_ANS_H_ +#define AOM_AOM_DSP_BUF_ANS_H_ // Buffered forward ANS writer. // Symbols are written to the writer in forward (decode) order and serialized // backwards due to ANS's stack like behavior. @@ -133,4 +133,4 @@ static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) { #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // AOM_DSP_BUF_ANS_H_ +#endif // AOM_AOM_DSP_BUF_ANS_H_ diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c index 4e224904e..6c2259f23 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.c +++ b/third_party/aom/aom_dsp/daalaboolreader.c @@ -39,3 +39,9 @@ uint32_t aom_daala_reader_tell(const daala_reader *r) { uint32_t aom_daala_reader_tell_frac(const daala_reader *r) { return od_ec_dec_tell_frac(&r->ec); } + +int aom_daala_reader_has_overflowed(const daala_reader *r) { + const uint32_t tell_bits = aom_daala_reader_tell(r); + const uint32_t tell_bytes = (tell_bits + 7) >> 3; + return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer); +} diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h index 60c197a49..ba78f916d 100644 --- a/third_party/aom/aom_dsp/daalaboolreader.h +++ b/third_party/aom/aom_dsp/daalaboolreader.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_DAALABOOLREADER_H_ -#define AOM_DSP_DAALABOOLREADER_H_ +#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_ +#define AOM_AOM_DSP_DAALABOOLREADER_H_ #include "aom/aom_integer.h" #include "aom_dsp/entdec.h" @@ -44,6 +44,9 @@ const uint8_t *aom_daala_reader_find_begin(daala_reader *r); const uint8_t *aom_daala_reader_find_end(daala_reader *r); uint32_t aom_daala_reader_tell(const daala_reader *r); uint32_t aom_daala_reader_tell_frac(const daala_reader *r); +// Returns true if the reader has tried to decode more data from the buffer +// than was actually provided. +int aom_daala_reader_has_overflowed(const daala_reader *r); static INLINE int aom_daala_read(daala_reader *r, int prob) { int bit; @@ -154,4 +157,4 @@ static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf, } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_DAALABOOLREADER_H_ diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h index f9c596c73..3848877ce 100644 --- a/third_party/aom/aom_dsp/daalaboolwriter.h +++ b/third_party/aom/aom_dsp/daalaboolwriter.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_DAALABOOLWRITER_H_ -#define AOM_DSP_DAALABOOLWRITER_H_ +#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_ +#define AOM_AOM_DSP_DAALABOOLWRITER_H_ #include @@ -75,4 +75,4 @@ static INLINE void daala_write_symbol(daala_writer *w, int symb, } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_DAALABOOLWRITER_H_ diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h index 5c15526e9..7ba2b1c39 100644 --- a/third_party/aom/aom_dsp/entcode.h +++ b/third_party/aom/aom_dsp/entcode.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_ENTCODE_H_ -#define AOM_DSP_ENTCODE_H_ +#ifndef AOM_AOM_DSP_ENTCODE_H_ +#define AOM_AOM_DSP_ENTCODE_H_ #include #include @@ -37,4 +37,4 @@ typedef uint32_t od_ec_window; OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng); -#endif // AOM_DSP_ENTCODE_H_ +#endif // AOM_AOM_DSP_ENTCODE_H_ diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c index b8e9078c3..d1764c47b 100644 --- a/third_party/aom/aom_dsp/entdec.c +++ b/third_party/aom/aom_dsp/entdec.c @@ -112,6 +112,7 @@ static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng, int ret) { int d; assert(rng <= 65535U); + // The number of leading zeros in the 16-bit binary representation of rng. d = 16 - OD_ILOG_NZ(rng); dec->cnt -= d; /*This is equivalent to shifting in 1's instead of 0's.*/ diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h index e35c3f99f..283bf1831 100644 --- a/third_party/aom/aom_dsp/entdec.h +++ b/third_party/aom/aom_dsp/entdec.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#if !defined(_entdec_H) -#define _entdec_H (1) +#ifndef AOM_AOM_DSP_ENTDEC_H_ +#define AOM_AOM_DSP_ENTDEC_H_ #include #include "aom_dsp/entcode.h" @@ -80,4 +80,4 @@ OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_ENTDEC_H_ diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c index 6866de9b9..a61da263c 100644 --- a/third_party/aom/aom_dsp/entenc.c +++ b/third_party/aom/aom_dsp/entenc.c @@ -60,6 +60,7 @@ static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low, int s; c = enc->cnt; assert(rng <= 65535U); + // The number of leading zeros in the 16-bit binary representation of rng. d = 16 - OD_ILOG_NZ(rng); s = c + d; /*TODO: Right now we flush every time we have at least one byte available. diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h index 1988f6818..3551d4250 100644 --- a/third_party/aom/aom_dsp/entenc.h +++ b/third_party/aom/aom_dsp/entenc.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#if !defined(_entenc_H) -#define _entenc_H (1) +#ifndef AOM_AOM_DSP_ENTENC_H_ +#define AOM_AOM_DSP_ENTENC_H_ #include #include "aom_dsp/entcode.h" @@ -82,4 +82,4 @@ void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); } // extern "C" #endif -#endif +#endif // AOM_AOM_DSP_ENTENC_H_ diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h index 2f3cd5fdc..5137331ae 100644 --- a/third_party/aom/aom_dsp/fft_common.h +++ b/third_party/aom/aom_dsp/fft_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_FFT_COMMON_H_ -#define AOM_DSP_FFT_COMMON_H_ +#ifndef AOM_AOM_DSP_FFT_COMMON_H_ +#define AOM_AOM_DSP_FFT_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -1047,4 +1047,4 @@ void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n, add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \ } -#endif // AOM_DSP_FFT_COMMON_H_ +#endif // AOM_AOM_DSP_FFT_COMMON_H_ diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c index ff1ec41a2..b96e1c319 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.c +++ b/third_party/aom/aom_dsp/grain_synthesis.c @@ -396,14 +396,15 @@ static void init_random_generator(int luma_line, uint16_t seed) { random_register ^= ((luma_num * 173 + 105) & 255); } -static void generate_luma_grain_block( +// Return 0 for success, -1 for failure +static int generate_luma_grain_block( const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block, int luma_block_size_y, int luma_block_size_x, int luma_grain_stride, int left_pad, int top_pad, int right_pad, int bottom_pad) { if (params->num_y_points == 0) { memset(luma_grain_block, 0, sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride); - return; + return 0; } int bit_depth = params->bit_depth; @@ -433,9 +434,11 @@ static void generate_luma_grain_block( ((wsum + rounding_offset) >> params->ar_coeff_shift), grain_min, grain_max); } + return 0; } -static void generate_chroma_grain_blocks( +// Return 0 for success, -1 for failure +static int generate_chroma_grain_blocks( const aom_film_grain_t *params, // int** pred_pos_luma, int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block, @@ -510,10 +513,11 @@ static void generate_chroma_grain_blocks( wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma; wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma; } else { - printf( + fprintf( + stderr, "Grain synthesis: prediction between two chroma components is " "not supported!"); - exit(1); + return -1; } } if (params->num_cb_points || params->chroma_scaling_from_luma) @@ -527,6 +531,7 @@ static void generate_chroma_grain_blocks( ((wsum_cr + rounding_offset) >> params->ar_coeff_shift), grain_min, grain_max); } + return 0; } static void init_scaling_function(const int scaling_points[][2], int num_points, @@ -910,8 +915,8 @@ static void hor_boundary_overlap(int *top_block, int top_stride, } } -void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, - aom_image_t *dst) { +int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, + aom_image_t *dst) { uint8_t *luma, *cb, *cr; int height, width, luma_stride, chroma_stride; int use_high_bit_depth = 0; @@ -953,8 +958,8 @@ void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, chroma_subsamp_y = 0; break; default: // unknown input format - printf("Film grain error: input format is not supported!"); - exit(1); + fprintf(stderr, "Film grain error: input format is not supported!"); + return -1; } assert(params->bit_depth == src->bit_depth); @@ -1011,17 +1016,16 @@ void av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src, luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth; chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth; - av1_add_film_grain_run(params, luma, cb, cr, height, width, luma_stride, - chroma_stride, use_high_bit_depth, chroma_subsamp_y, - chroma_subsamp_x, mc_identity); - return; + return av1_add_film_grain_run( + params, luma, cb, cr, height, width, luma_stride, chroma_stride, + use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity); } -void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, - uint8_t *cb, uint8_t *cr, int height, int width, - int luma_stride, int chroma_stride, - int use_high_bit_depth, int chroma_subsamp_y, - int chroma_subsamp_x, int mc_identity) { +int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity) { int **pred_pos_luma; int **pred_pos_chroma; int *luma_grain_block; @@ -1085,18 +1089,20 @@ void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y, chroma_subsamp_x); - generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, - luma_block_size_y, luma_block_size_x, - luma_grain_stride, left_pad, top_pad, right_pad, - bottom_pad); - - generate_chroma_grain_blocks( - params, - // pred_pos_luma, - pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, - luma_grain_stride, chroma_block_size_y, chroma_block_size_x, - chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, - chroma_subsamp_y, chroma_subsamp_x); + if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block, + luma_block_size_y, luma_block_size_x, + luma_grain_stride, left_pad, top_pad, right_pad, + bottom_pad)) + return -1; + + if (generate_chroma_grain_blocks( + params, + // pred_pos_luma, + pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block, + luma_grain_stride, chroma_block_size_y, chroma_block_size_x, + chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad, + chroma_subsamp_y, chroma_subsamp_x)) + return -1; init_scaling_function(params->scaling_points_y, params->num_y_points, scaling_lut_y); @@ -1399,4 +1405,5 @@ void av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block, &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf); + return 0; } diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h index 65feb6068..7aee6f6f4 100644 --- a/third_party/aom/aom_dsp/grain_synthesis.h +++ b/third_party/aom/aom_dsp/grain_synthesis.h @@ -13,8 +13,8 @@ * \brief Describes film grain parameters and film grain synthesis * */ -#ifndef AOM_AOM_GRAIN_SYNTHESIS_H_ -#define AOM_AOM_GRAIN_SYNTHESIS_H_ +#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ +#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ #ifdef __cplusplus extern "C" { @@ -85,6 +85,8 @@ typedef struct { * * Add film grain to an image * + * Returns 0 for success, -1 for failure + * * \param[in] grain_params Grain parameters * \param[in] luma luma plane * \param[in] cb cb plane @@ -94,25 +96,27 @@ typedef struct { * \param[in] luma_stride luma plane stride * \param[in] chroma_stride chroma plane stride */ -void av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, - uint8_t *cb, uint8_t *cr, int height, int width, - int luma_stride, int chroma_stride, - int use_high_bit_depth, int chroma_subsamp_y, - int chroma_subsamp_x, int mc_identity); +int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma, + uint8_t *cb, uint8_t *cr, int height, int width, + int luma_stride, int chroma_stride, + int use_high_bit_depth, int chroma_subsamp_y, + int chroma_subsamp_x, int mc_identity); /*!\brief Add film grain * * Add film grain to an image * + * Returns 0 for success, -1 for failure + * * \param[in] grain_params Grain parameters * \param[in] src Source image * \param[out] dst Resulting image with grain */ -void av1_add_film_grain(const aom_film_grain_t *grain_params, - const aom_image_t *src, aom_image_t *dst); +int av1_add_film_grain(const aom_film_grain_t *grain_params, + const aom_image_t *src, aom_image_t *dst); #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_AOM_GRAIN_SYNTHESIS_H_ +#endif // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_ diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h index 5c20413b2..a8ac50730 100644 --- a/third_party/aom/aom_dsp/grain_table.h +++ b/third_party/aom/aom_dsp/grain_table.h @@ -99,4 +99,4 @@ void aom_film_grain_table_free(aom_film_grain_table_t *t); } #endif -#endif +#endif // AOM_AOM_DSP_GRAIN_TABLE_H_ diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h index e047d98bc..3ec62a86e 100644 --- a/third_party/aom/aom_dsp/intrapred_common.h +++ b/third_party/aom/aom_dsp/intrapred_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_INTRAPRED_COMMON_H -#define _AOM_DSP_INTRAPRED_COMMON_H +#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_ +#define AOM_AOM_DSP_INTRAPRED_COMMON_H_ #include "config/aom_config.h" @@ -44,4 +44,4 @@ static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = { }; /* clang-format on */ -#endif // _AOM_DSP_INTRAPRED_COMMON_H +#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_ diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h index a0627c074..852415c20 100644 --- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h +++ b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ -#define AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ +#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ #include "aom_dsp/mips/macros_msa.h" #include "aom_dsp/aom_filter.h" @@ -76,4 +76,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; res7_m, out0, out1, out2, out3); \ } -#endif /* AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ */ +#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h index d51bfa899..c42188d62 100644 --- a/third_party/aom/aom_dsp/mips/common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/common_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_COMMON_MIPS_DSPR2_H_ -#define AOM_COMMON_MIPS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ #include @@ -48,4 +48,4 @@ static INLINE void prefetch_store_streamed(unsigned char *dst) { } // extern "C" #endif -#endif // AOM_COMMON_MIPS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c index 2a8f75938..097da73ca 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c @@ -15,7 +15,6 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c index ac87936da..40abfd89e 100644 --- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c +++ b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c @@ -15,7 +15,6 @@ #include "config/aom_dsp_rtcd.h" #include "aom_dsp/mips/convolve_common_dspr2.h" -#include "aom_dsp/aom_convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_ports/mem.h" diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h index e7b8d531b..e5d48a884 100644 --- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h +++ b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ -#define AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ #include @@ -45,4 +45,4 @@ void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // AOM_DSP_MIPS_AOM_COMMON_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h index 3e38ef3fb..28f0dc35a 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ -#define AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ #include @@ -733,4 +733,4 @@ static INLINE void wide_mbfilter_dspr2( } // extern "C" #endif -#endif // AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h index cb599cf2e..62295d69d 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ -#define AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ #include @@ -434,4 +434,4 @@ extern "C" { } // extern "C" #endif -#endif // AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h index 6db1dac08..a0f57f386 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ -#define AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ #include @@ -354,4 +354,4 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, } // extern "C" #endif -#endif // AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h index 450594262..54b0bb4bd 100644 --- a/third_party/aom/aom_dsp/mips/loopfilter_msa.h +++ b/third_party/aom/aom_dsp/mips/loopfilter_msa.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_LOOPFILTER_MSA_H_ -#define AOM_DSP_LOOPFILTER_MSA_H_ +#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ +#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ #include "aom_dsp/mips/macros_msa.h" @@ -248,4 +248,4 @@ mask_out = limit_in < (v16u8)mask_out; \ mask_out = __msa_xori_b(mask_out, 0xff); \ } -#endif /* AOM_DSP_LOOPFILTER_MSA_H_ */ +#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h index eb919d42b..9bfc27147 100644 --- a/third_party/aom/aom_dsp/mips/macros_msa.h +++ b/third_party/aom/aom_dsp/mips/macros_msa.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_MIPS_MACROS_MSA_H_ -#define AOM_DSP_MIPS_MACROS_MSA_H_ +#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_ +#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_ #include @@ -2055,4 +2055,4 @@ \ tmp1_m; \ }) -#endif /* AOM_DSP_MIPS_MACROS_MSA_H_ */ +#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_ diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c index a8ee85b6b..810b6efaa 100644 --- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c +++ b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c @@ -13,13 +13,9 @@ #include "aom_ports/mem.h" #include "aom_dsp/mips/macros_msa.h" +#include "aom_dsp/aom_filter.h" #include "aom_dsp/variance.h" -static const uint8_t bilinear_filters_msa[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - #define CALC_MSE_AVG_B(src, ref, var, sub) \ { \ v16u8 src_l0_m, src_l1_m; \ @@ -1626,8 +1622,8 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( uint32_t *sse) { \ int32_t diff; \ uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ \ if (yoffset) { \ if (xoffset) { \ @@ -1680,8 +1676,8 @@ AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64) int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ \ if (yoffset) { \ if (xoffset) { \ @@ -1730,8 +1726,8 @@ uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; if (yoffset) { if (xoffset) { @@ -1763,8 +1759,8 @@ uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \ + const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \ \ if (yoffset) { \ if (xoffset) { \ diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c index 5975c62e8..2faee8506 100644 --- a/third_party/aom/aom_dsp/noise_model.c +++ b/third_party/aom/aom_dsp/noise_model.c @@ -1135,7 +1135,9 @@ int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model, fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag); return 0; } + uint16_t random_seed = film_grain->random_seed; memset(film_grain, 0, sizeof(*film_grain)); + film_grain->random_seed = random_seed; film_grain->apply_grain = 1; film_grain->update_parameters = 1; @@ -1633,7 +1635,7 @@ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, return 0; } if (!film_grain->random_seed) { - film_grain->random_seed = 1071; + film_grain->random_seed = 7391; } memcpy(raw_data[0], ctx->denoised[0], (strides[0] * sd->y_height) << use_highbd); diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h index b07bf8617..049d5be15 100644 --- a/third_party/aom/aom_dsp/noise_model.h +++ b/third_party/aom/aom_dsp/noise_model.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_NOISE_MODEL_H_ -#define AOM_DSP_NOISE_MODEL_H_ +#ifndef AOM_AOM_DSP_NOISE_MODEL_H_ +#define AOM_AOM_DSP_NOISE_MODEL_H_ #ifdef __cplusplus extern "C" { @@ -320,4 +320,4 @@ void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // AOM_DSP_NOISE_MODEL_H_ +#endif // AOM_AOM_DSP_NOISE_MODEL_H_ diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h index ea4d9e3de..2284a171a 100644 --- a/third_party/aom/aom_dsp/noise_util.h +++ b/third_party/aom/aom_dsp/noise_util.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_NOISE_UTIL_H_ -#define AOM_DSP_NOISE_UTIL_H_ +#ifndef AOM_AOM_DSP_NOISE_UTIL_H_ +#define AOM_AOM_DSP_NOISE_UTIL_H_ #ifdef __cplusplus extern "C" { @@ -65,4 +65,4 @@ int aom_noise_data_validate(const double *data, int w, int h); } // extern "C" #endif // __cplusplus -#endif // AOM_DSP_NOISE_UTIL_H_ +#endif // AOM_AOM_DSP_NOISE_UTIL_H_ diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h index 11a8c5ad7..f3d87f264 100644 --- a/third_party/aom/aom_dsp/postproc.h +++ b/third_party/aom/aom_dsp/postproc.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_POSTPROC_H_ -#define AOM_DSP_POSTPROC_H_ +#ifndef AOM_AOM_DSP_POSTPROC_H_ +#define AOM_AOM_DSP_POSTPROC_H_ #ifdef __cplusplus extern "C" { @@ -23,4 +23,4 @@ int aom_setup_noise(double sigma, int size, char *noise); } #endif -#endif // AOM_DSP_POSTPROC_H_ +#endif // AOM_AOM_DSP_POSTPROC_H_ diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h index 85dd4249d..d003a986e 100644 --- a/third_party/aom/aom_dsp/prob.h +++ b/third_party/aom/aom_dsp/prob.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_PROB_H_ -#define AOM_DSP_PROB_H_ +#ifndef AOM_AOM_DSP_PROB_H_ +#define AOM_AOM_DSP_PROB_H_ #include #include @@ -668,4 +668,4 @@ static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { } // extern "C" #endif -#endif // AOM_DSP_PROB_H_ +#endif // AOM_AOM_DSP_PROB_H_ diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c index 37d3bb585..50f376a4a 100644 --- a/third_party/aom/aom_dsp/psnr.c +++ b/third_party/aom/aom_dsp/psnr.c @@ -295,7 +295,6 @@ int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a, default: assert(plane >= 0 && plane <= 2); return 0; } } - (void)highbd; switch (plane) { case 0: return aom_get_y_sse(a, b); case 1: return aom_get_u_sse(a, b); diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h index 8300b0a88..58e4e71ee 100644 --- a/third_party/aom/aom_dsp/psnr.h +++ b/third_party/aom/aom_dsp/psnr.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_PSNR_H_ -#define AOM_DSP_PSNR_H_ +#ifndef AOM_AOM_DSP_PSNR_H_ +#define AOM_AOM_DSP_PSNR_H_ #include "aom_scale/yv12config.h" @@ -76,4 +76,4 @@ double aom_psnrhvs(const YV12_BUFFER_CONFIG *source, #ifdef __cplusplus } // extern "C" #endif -#endif // AOM_DSP_PSNR_H_ +#endif // AOM_AOM_DSP_PSNR_H_ diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c index e1601cc7d..62dbd86a9 100644 --- a/third_party/aom/aom_dsp/quantize.c +++ b/third_party/aom/aom_dsp/quantize.c @@ -13,8 +13,8 @@ #include "aom_mem/aom_mem.h" void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -29,56 +29,54 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const int coeff = coeff_ptr[rc] * wt; - - if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && - coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) - non_zero_count--; - else - break; - } + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp32; - - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { - int64_t tmp = - clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), - INT16_MIN, INT16_MAX); - tmp *= wt; - tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - (16 - log_scale + AOM_QM_BITS)); // quantization - qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; - const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); - const int dequant = - (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; - dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - - if (tmp32) eob = i; - } + if (tmp32) eob = i; } } *eob_ptr = eob + 1; } void highbd_quantize_b_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, @@ -95,42 +93,40 @@ void highbd_quantize_b_helper_c( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const int coeff = coeff_ptr[rc] * wt; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || - coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) - idx_arr[idx++] = i; - } + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) || + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS))) + idx_arr[idx++] = i; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); - const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); - const int64_t tmpw = tmp1 * wt; - const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; - const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> - (16 - log_scale + AOM_QM_BITS)); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dequant = (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> - AOM_QM_BITS; - const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; - dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); - if (abs_qcoeff) eob = idx_arr[i]; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } @@ -138,74 +134,73 @@ void highbd_quantize_b_helper_c( /* These functions should only be called when quantisation matrices are not used. */ void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); + quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, - dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); + quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, + quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan, NULL, NULL, 2); } void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, - round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 0); } void aom_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, - round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 1); } void aom_highbd_quantize_b_64x64_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, skip_block, zbin_ptr, - round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr, + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, NULL, NULL, 2); } diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h index 56d50b929..c55ab234e 100644 --- a/third_party/aom/aom_dsp/quantize.h +++ b/third_party/aom/aom_dsp/quantize.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_QUANTIZE_H_ -#define AOM_DSP_QUANTIZE_H_ +#ifndef AOM_AOM_DSP_QUANTIZE_H_ +#define AOM_AOM_DSP_QUANTIZE_H_ #include "config/aom_config.h" @@ -21,8 +21,8 @@ extern "C" { #endif void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, @@ -30,24 +30,23 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const qm_val_t *iqm_ptr, const int log_scale); void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan); + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); void highbd_quantize_b_helper_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale); void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, @@ -57,4 +56,4 @@ void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } // extern "C" #endif -#endif // AOM_DSP_QUANTIZE_H_ +#endif // AOM_AOM_DSP_QUANTIZE_H_ diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c index ede4c583b..1e24df4a5 100644 --- a/third_party/aom/aom_dsp/sad.c +++ b/third_party/aom/aom_dsp/sad.c @@ -200,15 +200,16 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ uint16_t comp_pred[m * n]; \ - aom_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \ + ref, ref_stride); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } \ unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ uint16_t comp_pred[m * n]; \ - aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, \ - ref_stride, jcp_param); \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \ + m, n, ref, ref_stride, jcp_param); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h index 51a38a7e1..01dbb8fd2 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_H -#define _V128_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ #include #include @@ -341,4 +341,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return c_v128_ssd_s16_sum(s); } -#endif /* _V128_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h index d4fec4237..3c669d579 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_H -#define _V128_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ #include @@ -375,7 +375,8 @@ SIMD_INLINE uint32_t v128_movemask_8(v128 a) { uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( vandq_u8(vreinterpretq_u8_s64(a), vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); - return v64_u64(v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); + return v64_low_u32( + v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m))); #endif } @@ -954,4 +955,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); } -#endif /* _V128_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h index e508f6ad7..bbe9a9d28 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_C_H -#define _V128_INTRINSICS_C_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ #include #include @@ -885,4 +885,4 @@ SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s, SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; } -#endif /* _V128_INTRINSICS_C_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h index f9043fe99..6c7241ff4 100644 --- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V128_INTRINSICS_H -#define _V128_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ #include #include "aom_dsp/simd/v64_intrinsics_x86.h" @@ -653,4 +653,4 @@ SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); } -#endif /* _V128_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h index 4b70cc57b..cb99d35b7 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_H -#define _V256_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ #include #include @@ -373,4 +373,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { return c_v256_ssd_s16_sum(s); } -#endif /* _V256_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h index d96638488..bd86ea172 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h @@ -9,9 +9,9 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_H -#define _V256_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ #include "aom_dsp/simd/v256_intrinsics_v128.h" -#endif /* _V256_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h index 5b412df71..a1c08e95a 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_C_H -#define _V256_INTRINSICS_C_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ #include #include @@ -950,4 +950,4 @@ SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s, SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; } -#endif /* _V256_INTRINSICS_C_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h index 60b2a1791..d5b7905ef 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_V128_H -#define _V256_INTRINSICS_V128_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ #if HAVE_NEON #include "aom_dsp/simd/v128_intrinsics_arm.h" @@ -870,4 +870,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]); } -#endif /* _V256_INTRINSICS_V128_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_ diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h index 05f205169..44594bc41 100644 --- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V256_INTRINSICS_H -#define _V256_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ #if !defined(__AVX2__) @@ -747,4 +747,4 @@ SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) { #endif -#endif /* _V256_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h index 6ce53c6a9..afc55428d 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_H -#define _V64_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ #include #include @@ -229,4 +229,4 @@ SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { return c_v64_shr_n_s32(a, c); } -#endif /* _V64_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h index 267441b02..8f39ad6e8 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_H -#define _V64_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ #include @@ -677,4 +677,4 @@ SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { #endif -#endif /* _V64_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h index 8158899cb..028d68c4f 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_C_H -#define _V64_INTRINSICS_C_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ /* Note: This implements the intrinsics in plain, unoptimised C. Intended for reference, porting or debugging. */ @@ -965,4 +965,4 @@ SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) { return c_v64_shr_s32(a, c); } -#endif /* _V64_INTRINSICS_C_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_ diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h index 130052ee1..5f9a57b37 100644 --- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h +++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _V64_INTRINSICS_H -#define _V64_INTRINSICS_H +#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ +#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ #include #if defined(__SSSE3__) @@ -488,4 +488,4 @@ SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { #define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c) #define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c) -#endif /* _V64_INTRINSICS_H */ +#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_ diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c new file mode 100644 index 000000000..249394807 --- /dev/null +++ b/third_party/aom/aom_dsp/sse.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* Sum the difference between every corresponding element of the buffers. */ + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" + +int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c index 6ce3d7acb..681770ba9 100644 --- a/third_party/aom/aom_dsp/ssim.c +++ b/third_party/aom/aom_dsp/ssim.c @@ -275,7 +275,7 @@ double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, for (i = 0; i < height; i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0 }; + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; double ssim; double ssim2; double dssim; diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h index c8a389dfe..55038f4c2 100644 --- a/third_party/aom/aom_dsp/ssim.h +++ b/third_party/aom/aom_dsp/ssim.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_SSIM_H_ -#define AOM_DSP_SSIM_H_ +#ifndef AOM_AOM_DSP_SSIM_H_ +#define AOM_AOM_DSP_SSIM_H_ #define MAX_SSIM_DB 100.0; @@ -84,4 +84,4 @@ double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, } // extern "C" #endif -#endif // AOM_DSP_SSIM_H_ +#endif // AOM_AOM_DSP_SSIM_H_ diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h index 7deb0aea3..f98242840 100644 --- a/third_party/aom/aom_dsp/txfm_common.h +++ b/third_party/aom/aom_dsp/txfm_common.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_TXFM_COMMON_H_ -#define AOM_DSP_TXFM_COMMON_H_ +#ifndef AOM_AOM_DSP_TXFM_COMMON_H_ +#define AOM_AOM_DSP_TXFM_COMMON_H_ #include "aom_dsp/aom_dsp_common.h" #include "av1/common/enums.h" @@ -88,4 +88,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { return rv; } -#endif // AOM_DSP_TXFM_COMMON_H_ +#endif // AOM_AOM_DSP_TXFM_COMMON_H_ diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c index 817ebe15d..23b715309 100644 --- a/third_party/aom/aom_dsp/variance.c +++ b/third_party/aom/aom_dsp/variance.c @@ -55,24 +55,6 @@ uint32_t aom_get_mb_ss_c(const int16_t *a) { return sum; } -uint32_t aom_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 0, b, b_stride, sse); -} - -uint32_t aom_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return aom_sub_pixel_variance16x16_c(a, a_stride, 0, 4, b, b_stride, sse); -} - -uint32_t aom_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - uint32_t *sse) { - return aom_sub_pixel_variance16x16_c(a, a_stride, 4, 4, b, b_stride, sse); -} - static void variance(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, uint32_t *sse, int *sum) { int i, j; @@ -302,7 +284,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride) { + int ref_stride, int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -370,7 +352,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -387,7 +369,9 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { for (int i = 0; i < height; i++) { @@ -398,13 +382,13 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1, - width, height); + aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL, + -1, width, height); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16, - width, height); + aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel, + 16, width, height); } else { DECLARE_ALIGNED(16, uint8_t, temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]); @@ -415,12 +399,12 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, const int intermediate_height = (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, - width, intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), - MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, - width, height); + aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1), + ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, + width, intermediate_height); + aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), + MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, + width, height); } } @@ -429,11 +413,11 @@ void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride) { + int ref_stride, int subpel_search) { int i, j; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1); @@ -466,13 +450,14 @@ void aom_jnt_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { int i, j; const int fwd_offset = jcp_param->fwd_offset; const int bck_offset = jcp_param->bck_offset; - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { @@ -706,125 +691,125 @@ void aom_highbd_var_filter_block2d_bil_second_pass( dst, dst_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ + \ + return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ } /* All three forms of the variance are available in the same sizes. */ @@ -867,12 +852,13 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; @@ -887,9 +873,10 @@ void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, - uint16_t *comp_pred, int width, int height, + uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref8, int ref_stride, int bd) { + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -902,8 +889,6 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, if (is_scaled) { // Note: This is mostly a copy from the >=8X8 case in // build_inter_predictors() function, with some small tweaks. - uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); - // Some assumptions. const int plane = 0; @@ -958,7 +943,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -975,13 +960,14 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { - const uint16_t *ref; - int i; - ref = CONVERT_TO_SHORTPTR(ref8); - for (i = 0; i < height; i++) { + const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + for (int i = 0; i < height; i++) { memcpy(comp_pred, ref, width * sizeof(*comp_pred)); comp_pred += width; ref += ref_stride; @@ -989,13 +975,13 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, kernel, 16, NULL, -1, width, height, bd); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, NULL, -1, kernel, 16, width, height, bd); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); @@ -1012,22 +998,23 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, intermediate_height, bd); aom_highbd_convolve8_vert( CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); } } void aom_highbd_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd) { + int ref_stride, int bd, int subpel_search) { int i, j; const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1); @@ -1037,7 +1024,7 @@ void aom_highbd_comp_avg_upsampled_pred_c( } } -void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param) { @@ -1046,6 +1033,7 @@ void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, const int bck_offset = jcp_param->bck_offset; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { @@ -1061,17 +1049,18 @@ void aom_highbd_jnt_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, void aom_highbd_jnt_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int subpel_search) { int i, j; const int fwd_offset = jcp_param->fwd_offset; const int bck_offset = jcp_param->bck_offset; const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { @@ -1104,21 +1093,23 @@ void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } } -void aom_comp_mask_upsampled_pred(MACROBLOCKD *xd, const AV1_COMMON *const cm, - int mi_row, int mi_col, const MV *const mv, - uint8_t *comp_pred, const uint8_t *pred, - int width, int height, int subpel_x_q3, - int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, - int mask_stride, int invert_mask) { +void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, + int mi_row, int mi_col, const MV *const mv, + uint8_t *comp_pred, const uint8_t *pred, + int width, int height, int subpel_x_q3, + int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask, + int subpel_search) { if (subpel_x_q3 | subpel_y_q3) { - aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); ref = comp_pred; ref_stride = width; } - aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, - mask_stride, invert_mask); + aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); } #define MASK_SUBPIX_VAR(W, H) \ @@ -1164,13 +1155,14 @@ MASK_SUBPIX_VAR(32, 8) MASK_SUBPIX_VAR(16, 64) MASK_SUBPIX_VAR(64, 16) -void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { if (!invert_mask) @@ -1187,16 +1179,15 @@ void aom_highbd_comp_mask_pred_c(uint16_t *comp_pred, const uint8_t *pred8, void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int bd) { - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); - aom_highbd_comp_mask_pred(comp_pred, pred8, width, height, - CONVERT_TO_BYTEPTR(comp_pred), width, mask, - mask_stride, invert_mask); + bd, subpel_search); + aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width, + mask, mask_stride, invert_mask); } #define HIGHBD_MASK_SUBPIX_VAR(W, H) \ @@ -1214,7 +1205,7 @@ void aom_highbd_comp_mask_upsampled_pred( aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ - aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ @@ -1236,7 +1227,7 @@ void aom_highbd_comp_mask_upsampled_pred( aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ - aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ @@ -1258,7 +1249,7 @@ void aom_highbd_comp_mask_upsampled_pred( aom_highbd_var_filter_block2d_bil_second_pass( \ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ \ - aom_highbd_comp_mask_pred_c(temp3, second_pred, W, H, \ + aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \ invert_mask); \ \ diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h index b954470de..362da29d3 100644 --- a/third_party/aom/aom_dsp/variance.h +++ b/third_party/aom/aom_dsp/variance.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_VARIANCE_H_ -#define AOM_DSP_VARIANCE_H_ +#ifndef AOM_AOM_DSP_VARIANCE_H_ +#define AOM_AOM_DSP_VARIANCE_H_ #include "config/aom_config.h" @@ -70,18 +70,12 @@ typedef unsigned int (*aom_masked_subpixvariance_fn_t)( const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse); -void aom_comp_mask_upsampled_pred( - MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, - int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask); - void aom_highbd_comp_mask_upsampled_pred( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, - int bd); + int bd, int subpel_search); typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride, const int32_t *wsrc, @@ -133,4 +127,4 @@ uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride, } // extern "C" #endif -#endif // AOM_DSP_VARIANCE_H_ +#endif // AOM_AOM_DSP_VARIANCE_H_ diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c index 401fbdc48..5f5bf5f14 100644 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -22,6 +22,13 @@ filter8_1dfunction aom_filter_block1d8_h8_sse2; filter8_1dfunction aom_filter_block1d4_v8_sse2; filter8_1dfunction aom_filter_block1d4_h8_sse2; +#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2 +#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2 +#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2 +#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2 +#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2 +#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2 + filter8_1dfunction aom_filter_block1d16_v2_sse2; filter8_1dfunction aom_filter_block1d16_h2_sse2; filter8_1dfunction aom_filter_block1d8_v2_sse2; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c index f3fe50372..94b5da171 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c @@ -74,6 +74,87 @@ static INLINE void xx_store2_mi128(const uint8_t *output_ptr, _mm256_extractf128_si256(*a, 1)); } +static void aom_filter_block1d4_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2)); + + // multiple the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 4 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = + _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); + + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } +} + static void aom_filter_block1d4_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -179,6 +260,100 @@ static void aom_filter_block1d4_h8_avx2( } } +static void aom_filter_block1d8_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 8 bytes + if (i > 0) { + __m128i srcReg1, srcRegFilt1_1; + __m128i srcRegFilt2, srcRegFilt3; + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + srcRegFilt3 = + _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters)); + + // add and saturate the results together + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = + _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32)); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 8 bytes + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1); + } +} + static void aom_filter_block1d8_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -311,6 +486,121 @@ static void aom_filter_block1d8_h8_avx2( } } +static void aom_filter_block1d16_h4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg32, filt2Reg, filt3Reg; + __m256i secondFilters, thirdFilters; + __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m256i srcReg32b1, srcReg32b2, filtersReg32; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + src_ptr -= 3; + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + + // multiply the size of the source and destination stride by two + src_stride = src_pixels_per_line << 1; + dst_stride = output_pitch << 1; + for (i = output_height; i > 1; i -= 2) { + // load the 2 strides of source + srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading 2 strides of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = + xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8); + + // filter the source buffer + srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1); + output_ptr += dst_stride; + } + + // if the number of strides is odd. + // process only 16 bytes + if (i > 0) { + __m256i srcReg1, srcReg12; + __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1; + + srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr)); + srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94); + + // filter the source buffer + srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg); + srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters); + srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters); + + // add and saturate the results together + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3); + + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32); + srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1); + srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8); + + // save 16 bytes + _mm_store_si128((__m128i *)output_ptr, + _mm256_castsi256_si128(srcRegFilt1_1)); + } +} + static void aom_filter_block1d16_h8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -507,6 +797,92 @@ static void aom_filter_block1d16_h8_avx2( } } +static void aom_filter_block1d8_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i resReg23_34_lo, resReg45_56_lo; + __m256i resReglo, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi64(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + static void aom_filter_block1d8_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { @@ -659,6 +1035,104 @@ static void aom_filter_block1d8_v8_avx2( } } +static void aom_filter_block1d16_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi; + __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi; + __m256i resReglo, resReghi, resReg; + __m256i secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters); + resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters); + + // add and saturate the results together + resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters); + resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters); + + // add and saturate the results together + resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm256_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + resReghi = _mm256_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReghi); + + src_ptr += src_stride; + + xx_store2_mi128(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg23_34_hi = srcReg45_56_hi; + srcReg4x = srcReg6x; + } +} + static void aom_filter_block1d16_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { @@ -854,6 +1328,88 @@ static void aom_filter_block1d16_v8_avx2( } } +static void aom_filter_block1d4_v4_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i filtersReg32, addFilterReg32; + __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56; + __m256i srcReg23_34_lo, srcReg45_56_lo; + __m256i srcReg2345_3456_lo; + __m256i resReglo, resReg; + __m256i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm256_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. + filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + // have the same data in both lanes of a 256 bit register + filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); + + firstFilters = + _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2); + srcReg4x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4))); + + // have consecutive loads on the same 256 register + srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21); + + srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + // load the last 2 loads of 16 bytes and have every two + // consecutive loads in the same 256 bit register + srcReg5x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5))); + srcReg45 = + _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1); + + srcReg6x = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6))); + srcReg56 = + _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1); + + // merge every two consecutive registers + srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + + resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256()); + + // shift by 6 bit each 16 bit + resReglo = _mm256_adds_epi16(resReglo, addFilterReg32); + resReglo = _mm256_srai_epi16(resReglo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg = _mm256_packus_epi16(resReglo, resReglo); + + src_ptr += src_stride; + + xx_storeu2_epi32(output_ptr, out_pitch, &resReg); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4x = srcReg6x; + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d16_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 6bcb4a512..325a21b76 100644 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -287,6 +287,13 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; +#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3 +#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3 +#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3 +#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3 +#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3 +#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3 + filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c new file mode 100644 index 000000000..67fb4d32b --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include // SSE4.1 +#include // AVX2 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w16_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval, + int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s1_0 = yy_loadu_256(src1); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + _mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + __m256i res = _mm256_packus_epi16(res0, res0); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res)); +} + +static INLINE void blend_a64_d16_mask_w32_avx2( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset, + const __m256i *v_maxval, int shift) { + const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0); + const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1); + const __m256i s0_0 = yy_loadu_256(src0); + const __m256i s0_1 = yy_loadu_256(src0 + 16); + const __m256i s1_0 = yy_loadu_256(src1); + const __m256i s1_1 = yy_loadu_256(src1 + 16); + __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0), + _mm256_unpacklo_epi16(*m0, max_minus_m0)); + __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0), + _mm256_unpackhi_epi16(*m0, max_minus_m0)); + __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1), + _mm256_unpacklo_epi16(*m1, max_minus_m1)); + __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1), + _mm256_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = + _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = + _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift); + const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi); + const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi); + __m256i res = _mm256_packus_epi16(res0, res1); + res = _mm256_permute4x64_epi64(res, 0xd8); + _mm256_storeu_si256((__m256i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m = xx_loadu_128(mask); + const __m256i m0 = _mm256_cvtepu8_epi16(m); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m = yy_loadu_256(mask + j); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m)); + const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m256i m_i00 = yy_loadu_256(mask); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + + blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j); + const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32); + + const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10); + const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11); + const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b); + const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b); + const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2); + const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + 2 * j); + const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32); + const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b); + const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b); + const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros); + const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(m_ac); + + blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m256i *round_offset, int shift) { + const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m256i zeros = _mm256_setzero_si256(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 32) { + const __m256i m_i00 = yy_loadu_256(mask + j); + const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j); + + const __m256i m_ac = + _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros); + const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac)); + const __m256i m1 = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1)); + + blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +void aom_lowbd_blend_a64_d16_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params) { + const int bd = 8; + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const __m256i y_round_offset = _mm256_set1_epi32(round_offset); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else if (subw == 1 && subh == 0) { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } else { + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 16: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &y_round_offset, shift); + break; + } + } +} + +static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0)); + const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1)); + const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8); + const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w); + const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8); + return v_res; +} + +static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1, + const __m256i *v_m0_b, + const __m256i *v_m1_b, + const int32_t bits) { + const __m256i v_s0_b = yy_loadu_256(src0); + const __m256i v_s1_b = yy_loadu_256(src1); + + const __m256i v_p0_w = + _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b), + _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m256i v_p1_w = + _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b), + _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits); + const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits); + const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + +static INLINE void blend_a64_mask_sx_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m256i v_ral_b = yy_loadu_256(mask); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + + const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xFF); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ral_b = yy_loadu_256(mask + 2 * c); + const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c); + const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32); + const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b); + const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b); + const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b); + const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b); + const __m256i v_rvsbl_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b); + const __m256i v_rvsbh_w = + _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b); + const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w); + const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w); + + const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2); + const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2); + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); + const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sx_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m256i v_zmask_b = _mm256_set1_epi16(0xff); + do { + const __m256i v_rl_b = yy_loadu_256(mask); + const __m256i v_al_b = + _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1)); + + const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b); + const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256()); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b, + AOM_BLEND_A64_ROUND_BITS); + + xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b)); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle); + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_r0_b = yy_loadu_256(mask + 2 * c); + const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32); + const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m256i v_al_b = + _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8)); + const __m256i v_ah_b = + _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8)); + + const __m256i v_m0_b = + _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sx_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_r_b = xx_loadl_64(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_r_b = xx_loadu_128(mask); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + break; + } +} + +static INLINE void blend_a64_mask_sy_w16_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + const __m128i v_ra_b = xx_loadu_128(mask); + const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + + const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_ra_b = yy_loadu_256(mask + c); + const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride); + const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_sy_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + switch (w) { + case 4: + do { + const __m128i v_ra_b = xx_loadl_32(mask); + const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_ra_b = xx_loadl_64(mask); + const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += 2 * mask_stride; + } while (--h); + break; + case 16: + blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, h); + break; + default: + blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +static INLINE void blend_a64_mask_w32n_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + do { + int c; + for (c = 0; c < w; c += 32) { + const __m256i v_m0_b = yy_loadu_256(mask + c); + const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b); + + const __m256i v_res_b = blend_32_u8_avx2( + src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS); + + yy_storeu_256(dst + c, v_res_b); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); +} + +static INLINE void blend_a64_mask_avx2( + uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, + uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h) { + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + switch (w) { + case 4: + do { + const __m128i v_m0_b = xx_loadl_32(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_32(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 8: + do { + const __m128i v_m0_b = xx_loadl_64(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storel_64(dst, v_res_b); + + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + case 16: + do { + const __m128i v_m0_b = xx_loadu_128(mask); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); + + xx_storeu_128(dst, v_res_b); + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } while (--h); + break; + default: + blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } +} + +void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, + const uint8_t *src0, uint32_t src0_stride, + const uint8_t *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, + int h, int subx, int suby) { + assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); + + assert(h >= 1); + assert(w >= 1); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) + aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h, subx, suby); + } else { + if (subx & suby) { + blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (subx) { + blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else if (suby) { + blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h); + } else { + blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, + mask, mask_stride, w, h); + } + } +} diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c index 49c20b467..9d6b4c2f7 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c @@ -20,6 +20,7 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/blend_sse4.h" +#include "aom_dsp/x86/blend_mask_sse4.h" #include "config/aom_dsp_rtcd.h" @@ -32,19 +33,13 @@ static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_32(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -59,19 +54,13 @@ static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_m0_b = xx_loadl_64(mask); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); - + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); dst += dst_stride; @@ -85,23 +74,17 @@ static void blend_a64_mask_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_m0l_b = xx_loadl_64(mask + c); - const __m128i v_m0h_b = xx_loadl_64(mask + c + 8); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b); - const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + const __m128i v_m0_b = xx_loadu_128(mask + c); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -120,23 +103,20 @@ static void blend_a64_mask_sx_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadl_64(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); dst += dst_stride; @@ -150,22 +130,20 @@ static void blend_a64_mask_sx_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_r_b = xx_loadu_128(mask); - const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); - - const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -180,28 +158,24 @@ static void blend_a64_mask_sx_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { - const __m128i v_rl_b = xx_loadu_128(mask + 2 * c); - const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16); - const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1)); - const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1)); - - const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b); - const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); + const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); + const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); + const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); + const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); + const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -220,21 +194,18 @@ static void blend_a64_mask_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); + do { const __m128i v_ra_b = xx_loadl_32(mask); const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -249,21 +220,16 @@ static void blend_a64_mask_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - (void)w; + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); - - const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -278,26 +244,18 @@ static void blend_a64_mask_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zero = _mm_setzero_si128(); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { const __m128i v_ra_b = xx_loadu_128(mask + c); const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); - const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b); - const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); - - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -316,27 +274,24 @@ static void blend_a64_mask_sx_sy_w4_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadl_64(mask); const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); - + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_32(dst, v_res_b); @@ -351,27 +306,25 @@ static void blend_a64_mask_sx_sy_w8_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); (void)w; do { const __m128i v_ra_b = xx_loadu_128(mask); const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); - const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); - const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); - const __m128i v_rvsb_w = - _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); - const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); + const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); + const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); + const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); + const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); + const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); - const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); - - const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); + const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); xx_storel_64(dst, v_res_b); @@ -388,8 +341,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const uint8_t *mask, uint32_t mask_stride, int w, int h) { const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff); - const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - + const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); + const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { int c; for (c = 0; c < w; c += 16) { @@ -410,14 +363,11 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); - const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w); - const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w); - - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0l_w, v_m1l_w); - const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0h_w, v_m1h_w); + const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); + const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); - const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); + const __m128i v_res_b = + blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); xx_storeu_128(dst + c, v_res_b); } @@ -921,24 +871,140 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, } } -static INLINE void blend_a64_d16_mask(uint8_t *dst, const CONV_BUF_TYPE *src0, - const CONV_BUF_TYPE *src1, - const __m128i *m, - const __m128i *v_round_offset, - const __m128i *v_maxval, int round_bits) { - const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); - const __m128i s0 = xx_loadl_64(src0); - const __m128i s1 = xx_loadl_64(src1); - const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); - const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); - const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); - const __m128i res_b = _mm_srli_epi32(res_a, AOM_BLEND_A64_ROUND_BITS); - const __m128i res_c = _mm_sub_epi32(res_b, *v_round_offset); - const __m128i res_d = xx_roundn_epi32(res_c, round_bits); - const __m128i res_e = _mm_packs_epi32(res_d, res_d); - const __m128i res = _mm_packus_epi16(res_e, res_e); - - xx_storel_32(dst, res); +static INLINE void blend_a64_d16_mask_w16_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, + const __m128i *v_maxval, int shift) { + const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); + const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); + const __m128i s0_0 = xx_loadu_128(src0); + const __m128i s0_1 = xx_loadu_128(src0 + 8); + const __m128i s1_0 = xx_loadu_128(src1); + const __m128i s1_1 = xx_loadu_128(src1 + 8); + __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), + _mm_unpacklo_epi16(*m0, max_minus_m0)); + __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), + _mm_unpackhi_epi16(*m0, max_minus_m0)); + __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), + _mm_unpacklo_epi16(*m1, max_minus_m1)); + __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), + _mm_unpackhi_epi16(*m1, max_minus_m1)); + res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); + res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); + res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); + res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); + const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); + const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); + const __m128i res = _mm_packus_epi16(res0, res1); + + _mm_storeu_si128((__m128i *)(dst), res); +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m = xx_loadu_128(mask + j); + const __m128i m0 = _mm_cvtepu8_epi16(m); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); + const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); + const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); + const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + j); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); + + const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); + const __m128i m0 = _mm_cvtepu8_epi16(m_ac); + const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); + + blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, + round_offset, &v_maxval, shift); + } + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } } void aom_lowbd_blend_a64_d16_mask_sse4_1( @@ -947,12 +1013,15 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params) { const int bd = 8; - const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - const int round_offset = (1 << (offset_bits - conv_params->round_1)) + - (1 << (offset_bits - conv_params->round_1 - 1)); const int round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); @@ -961,69 +1030,80 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( assert(IS_POWER_OF_TWO(h)); assert(IS_POWER_OF_TWO(w)); - const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); - const __m128i v_ro_a = xx_loadl_32(&round_offset); - const __m128i v_round_offset = _mm_shuffle_epi32(v_ro_a, 0); - const __m128i one_w = _mm_set1_epi16(1); - const __m128i one_b = _mm_set1_epi8(1); - const __m128i two_w = _mm_set1_epi16(2); + const __m128i v_round_offset = _mm_set1_epi32(round_offset); if (subw == 0 && subh == 0) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m0 = xx_loadl_32(&mask[i * mask_stride + j]); - const __m128i m = _mm_cvtepu8_epi16(m0); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } + } else if (subw == 1 && subh == 1) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = - xx_loadl_64(&mask[(2 * i) * mask_stride + (2 * j)]); - const __m128i m_i1 = - xx_loadl_64(&mask[(2 * i + 1) * mask_stride + (2 * j)]); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m_bd = _mm_maddubs_epi16(m_i1, one_b); - const __m128i m_acbd = _mm_add_epi16(m_ac, m_bd); - const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); - const __m128i m = _mm_srli_epi16(m_acbd_2, 2); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } else if (subw == 1 && subh == 0) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = xx_loadl_64(&mask[i * mask_stride + (2 * j)]); - const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); - const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); - const __m128i m = _mm_srli_epi16(m_ac_1, 1); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } else { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; j += 4) { - const __m128i m_i0 = xx_loadl_64(&mask[(2 * i) * mask_stride + j]); - const __m128i m_i1 = xx_loadl_64(&mask[(2 * i + 1) * mask_stride + j]); - const __m128i m_i01 = _mm_unpacklo_epi8(m_i0, m_i1); - const __m128i m_ac = _mm_maddubs_epi16(m_i01, one_b); - const __m128i m_ac_1 = _mm_add_epi16(m_ac, one_w); - const __m128i m = _mm_srli_epi16(m_ac_1, 1); - - blend_a64_d16_mask(&dst[i * dst_stride + j], &src0[i * src0_stride + j], - &src1[i * src1_stride + j], &m, &v_round_offset, - &v_maxval, round_bits); - } + switch (w) { + case 4: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + case 8: + aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift); + break; + default: + lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift); + break; } } } diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c index 59506bdfe..064910232 100644 --- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -39,7 +39,7 @@ static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -64,7 +64,7 @@ static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); - const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w); + const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); @@ -90,9 +90,9 @@ static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, const __m128i v_m0_w = _mm_set1_epi16(*mask); const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); for (c = 0; c < w; c += 16) { - const __m128i v_resl_w = blend_8(src0 + c, src1 + c, v_m0_w, v_m1_w); + const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); const __m128i v_resh_w = - blend_8(src0 + c + 8, src1 + c + 8, v_m0_w, v_m1_w); + blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h new file mode 100644 index 000000000..c071fdcfc --- /dev/null +++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ +#include // SSE4.1 + +#include + +#include "aom/aom_integer.h" +#include "aom_ports/mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/blend.h" + +#include "aom_dsp/x86/synonyms.h" + +#include "config/aom_dsp_rtcd.h" + +static INLINE void blend_a64_d16_mask_w4_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadl_64(src0); + const __m128i s1 = xx_loadl_64(src1); + const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); + const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); + const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); + const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); + const __m128i res_d = _mm_srai_epi32(res_c, shift); + const __m128i res_e = _mm_packs_epi32(res_d, res_d); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + xx_storel_32(dst, res); +} + +static INLINE void blend_a64_d16_mask_w8_sse41( + uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, + int shift) { + const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); + const __m128i s0 = xx_loadu_128(src0); + const __m128i s1 = xx_loadu_128(src1); + __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), + _mm_unpacklo_epi16(*m, max_minus_m)); + __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), + _mm_unpackhi_epi16(*m, max_minus_m)); + res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); + res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); + const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); + const __m128i res = _mm_packus_epi16(res_e, res_e); + + _mm_storel_epi64((__m128i *)(dst), res); +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_32(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + for (int i = 0; i < h; ++i) { + const __m128i m0 = xx_loadl_64(mask); + const __m128i m = _mm_cvtepu8_epi16(m0); + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_i1 = xx_loadu_128(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); + const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); + const __m128i m = _mm_srli_epi16(m_acbd_2, 2); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i one_b = _mm_set1_epi8(1); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadu_128(mask); + const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); + const __m128i m = _mm_avg_epu16(m_ac, zeros); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} + +static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( + uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift) { + const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + const __m128i zeros = _mm_setzero_si128(); + for (int i = 0; i < h; ++i) { + const __m128i m_i0 = xx_loadl_64(mask); + const __m128i m_i1 = xx_loadl_64(mask + mask_stride); + const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); + const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); + + blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, + shift); + mask += mask_stride << 1; + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + } +} +#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h index 4880438bc..8d9b32510 100644 --- a/third_party/aom/aom_dsp/x86/blend_sse4.h +++ b/third_party/aom/aom_dsp/x86/blend_sse4.h @@ -9,42 +9,44 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_BLEND_SSE4_H_ -#define AOM_DSP_X86_BLEND_SSE4_H_ +#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_ +#define AOM_AOM_DSP_X86_BLEND_SSE4_H_ #include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" +static const uint8_t g_blend_a64_mask_shuffle[32] = { + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, + 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, +}; ////////////////////////////////////////////////////////////////////////////// // Common kernels ////////////////////////////////////////////////////////////////////////////// static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_32(src0); const __m128i v_s1_b = xx_loadl_32(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); - + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); - const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS); return v_res_w; } static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, - const __m128i v_m0_w, const __m128i v_m1_w) { + const __m128i *v_m0_w, const __m128i *v_m1_w) { const __m128i v_s0_b = xx_loadl_64(src0); const __m128i v_s1_b = xx_loadl_64(src1); const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b); - const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w); - const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w); + const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w); + const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w); const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w); @@ -53,6 +55,51 @@ static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1, return v_res_w; } +static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_32(src0); + const __m128i v_s1_b = xx_loadl_32(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadl_64(src0); + const __m128i v_s1_b = xx_loadl_64(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w); + return v_res; +} + +static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1, + const __m128i *v_m0_b, const __m128i *v_m1_b, + const __m128i *rounding) { + const __m128i v_s0_b = xx_loadu_128(src0); + const __m128i v_s1_b = xx_loadu_128(src1); + + const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b), + _mm_unpacklo_epi8(*v_m0_b, *v_m1_b)); + const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b), + _mm_unpackhi_epi8(*v_m0_b, *v_m1_b)); + + const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding); + const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding); + const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w); + return v_res; +} + typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1, const __m128i v_m0_w, const __m128i v_m1_w); @@ -141,4 +188,4 @@ static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1, return v_res_w; } -#endif // AOM_DSP_X86_BLEND_SSE4_H_ +#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h index 3f46420dd..96fe4ebb6 100644 --- a/third_party/aom/aom_dsp/x86/common_avx2.h +++ b/third_party/aom/aom_dsp/x86/common_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_COMMON_AVX2_H -#define AOM_DSP_X86_COMMON_AVX2_H +#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_COMMON_AVX2_H_ #include @@ -144,4 +144,4 @@ static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) { out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20); out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31); } -#endif +#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h index 36fb1963a..3e19682cd 100644 --- a/third_party/aom/aom_dsp/x86/convolve.h +++ b/third_party/aom/aom_dsp/x86/convolve.h @@ -8,8 +8,8 @@ * Media Patent License 1.0 was not distributed with this source code in the * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_H_ -#define AOM_DSP_X86_CONVOLVE_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_H_ #include @@ -17,7 +17,6 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" -#include "aom_dsp/aom_convolve.h" typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, @@ -34,7 +33,30 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, (void)y_step_q4; \ assert((-128 <= filter[3]) && (filter[3] <= 127)); \ assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ + if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ + (filter[2] | filter[5])) { \ + while (w >= 16) { \ + aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ dst_stride, h, filter); \ @@ -153,4 +175,4 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#endif // AOM_DSP_X86_CONVOLVE_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h index 72fabd236..30253f65c 100644 --- a/third_party/aom/aom_dsp/x86/convolve_avx2.h +++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_AVX2_H_ -#define AOM_DSP_X86_CONVOLVE_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ // filters for 16 DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = { @@ -29,6 +29,11 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = { 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, }; +DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, +}; + static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { @@ -191,4 +196,4 @@ static INLINE __m256i highbd_convolve_rounding( return res_round; } -#endif +#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h index e80c5872f..707bd2d78 100644 --- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h +++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ -#define _AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -28,4 +28,4 @@ static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res, _mm_store_si128((__m128i *)dst, d); } -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h index 399df5d6d..445d04b10 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse2.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_CONVOLVE_SSE2_H_ -#define AOM_DSP_X86_CONVOLVE_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -118,4 +118,4 @@ static INLINE __m128i highbd_convolve_rounding_sse2( return res_round; } -#endif +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h index d48c25667..6b8388d84 100644 --- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h +++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ -#define _AOM_DSP_X86_CONVOLVE_SSE4_1_INTRIN_H_ +#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ +#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ // Note: // This header file should be put below any x86 intrinsics head file @@ -50,4 +50,4 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, return res; } -#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_ +#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_ diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h index 12ccf7f26..260d8dd58 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_ -#define AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ #ifdef __cplusplus extern "C" { @@ -152,4 +152,4 @@ static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { } // extern "C" #endif -#endif // AOM_DSP_X86_FWD_TXFM_SSE2_H_ +#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm deleted file mode 100644 index 99f17ebdf..000000000 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_impl_sse2.asm +++ /dev/null @@ -1,351 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -SECTION .text - -;void aom_half_horiz_vert_variance16x_h_sse2(unsigned char *ref, -; int ref_stride, -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_horiz_vert_variance16x_h_sse2) PRIVATE -sym(aom_half_horiz_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - pxor xmm0, xmm0 ; - - movdqu xmm5, XMMWORD PTR [rsi] - movdqu xmm3, XMMWORD PTR [rsi+1] - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) horizontal line 1 - - lea rsi, [rsi + rax] - -aom_half_horiz_vert_variance16x_h_1: - movdqu xmm1, XMMWORD PTR [rsi] ; - movdqu xmm2, XMMWORD PTR [rsi+1] ; - pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm3) horizontal line i+1 - - pavgb xmm5, xmm1 ; xmm = vertical average of the above - - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm4, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - psubw xmm5, xmm3 ; xmm5 -= xmm3 - - movq xmm3, QWORD PTR [rdi+8] - punpcklbw xmm3, xmm0 - psubw xmm4, xmm3 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm1 ; save xmm1 for use on the next row - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz aom_half_horiz_vert_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_half_vert_variance16x_h_sse2(unsigned char *ref, -; int ref_stride, -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_vert_variance16x_h_sse2) PRIVATE -sym(aom_half_vert_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - movdqu xmm5, XMMWORD PTR [rsi] - lea rsi, [rsi + rax ] - pxor xmm0, xmm0 - -aom_half_vert_variance16x_h_1: - movdqu xmm3, XMMWORD PTR [rsi] - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm4, xmm5 - punpcklbw xmm5, xmm0 - punpckhbw xmm4, xmm0 - - movq xmm2, QWORD PTR [rdi] - punpcklbw xmm2, xmm0 - psubw xmm5, xmm2 - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - psubw xmm4, xmm2 - - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm4 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm4, xmm4 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm4 - - movdqa xmm5, xmm3 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 - jnz aom_half_vert_variance16x_h_1 - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void aom_half_horiz_variance16x_h_sse2(unsigned char *ref, -; int ref_stride -; unsigned char *src, -; int src_stride, -; unsigned int height, -; int *sum, -; unsigned int *sumsquared) -global sym(aom_half_horiz_variance16x_h_sse2) PRIVATE -sym(aom_half_horiz_variance16x_h_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 ; error accumulator - pxor xmm7, xmm7 ; sse eaccumulator - mov rsi, arg(0) ;ref - - mov rdi, arg(2) ;src - movsxd rcx, dword ptr arg(4) ;height - movsxd rax, dword ptr arg(1) ;ref_stride - movsxd rdx, dword ptr arg(3) ;src_stride - - pxor xmm0, xmm0 ; - -aom_half_horiz_variance16x_h_1: - movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15 - movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16 - - pavgb xmm5, xmm3 ; xmm5 = avg(xmm1,xmm3) - movdqa xmm1, xmm5 - punpcklbw xmm5, xmm0 ; xmm5 = words of above - punpckhbw xmm1, xmm0 - - movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7 - punpcklbw xmm3, xmm0 ; xmm3 = words of above - movq xmm2, QWORD PTR [rdi+8] - punpcklbw xmm2, xmm0 - - psubw xmm5, xmm3 ; xmm5 -= xmm3 - psubw xmm1, xmm2 - paddw xmm6, xmm5 ; xmm6 += accumulated column differences - paddw xmm6, xmm1 - pmaddwd xmm5, xmm5 ; xmm5 *= xmm5 - pmaddwd xmm1, xmm1 - paddd xmm7, xmm5 ; xmm7 += accumulated square column differences - paddd xmm7, xmm1 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - - sub rcx, 1 ; - jnz aom_half_horiz_variance16x_h_1 ; - - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(5) ;[Sum] - mov rdi, arg(6) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -aom_bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 diff --git a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c b/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c deleted file mode 100644 index 2a018c1cf..000000000 --- a/third_party/aom/aom_dsp/x86/halfpix_variance_sse2.c +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include - -#include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" - -#include "aom/aom_integer.h" - -void aom_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref, - int ref_stride, - const unsigned char *src, - int src_stride, unsigned int height, - int *sum, unsigned int *sumsquared); -void aom_half_horiz_variance16x_h_sse2(const unsigned char *ref, int ref_stride, - const unsigned char *src, int src_stride, - unsigned int height, int *sum, - unsigned int *sumsquared); -void aom_half_vert_variance16x_h_sse2(const unsigned char *ref, int ref_stride, - const unsigned char *src, int src_stride, - unsigned int height, int *sum, - unsigned int *sumsquared); - -uint32_t aom_variance_halfpixvar16x16_h_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - - aom_half_horiz_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} - -uint32_t aom_variance_halfpixvar16x16_v_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - aom_half_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, &xsum0, - &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} - -uint32_t aom_variance_halfpixvar16x16_hv_sse2(const unsigned char *src, - int src_stride, - const unsigned char *dst, - int dst_stride, uint32_t *sse) { - int xsum0; - unsigned int xxsum0; - - aom_half_horiz_vert_variance16x_h_sse2(src, src_stride, dst, dst_stride, 16, - &xsum0, &xxsum0); - - *sse = xxsum0; - assert(xsum0 <= 255 * 16 * 16); - assert(xsum0 >= -255 * 16 * 16); - return (xxsum0 - ((uint32_t)((int64_t)xsum0 * xsum0) >> 8)); -} diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c index 83e0098ba..097e0778f 100644 --- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -327,6 +327,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt, const unsigned char *lt, const unsigned char *thr, int bd) { int i; + const __m128i zero = _mm_setzero_si128(); __m128i blimit, limit, thresh; __m128i t80; get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80); @@ -355,13 +356,18 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat2 = _mm_unpacklo_epi64(flat2, flat2); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3], flat_pq[3]; - __m128i flat2_p[6], flat2_q[6]; - __m128i flat2_pq[6]; - { - __m128i work0; + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3], flat_pq[3]; + __m128i flat2_p[6], flat2_q[6]; + __m128i flat2_pq[6]; + __m128i sum_p6, sum_p3; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); + + __m128i work0, work0_0, work0_1, sum_p_0; __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3])); __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1])); sum_p = _mm_add_epi16(sum_p, sum_lp); @@ -369,30 +375,23 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( __m128i sum_lq = _mm_srli_si128(sum_lp, 8); __m128i sum_q = _mm_srli_si128(sum_p, 8); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - work0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); - flat2_p[0] = _mm_add_epi16(sum_p, _mm_add_epi16(work0, q[0])); - flat2_q[0] = - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_srli_si128(work0, 8), p[0])); - - flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])); + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0])); flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])); - __m128i sum_p6, sum_p3; sum_p6 = _mm_add_epi16(pq[6], pq[6]); sum_p3 = _mm_add_epi16(pq[3], pq[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); + sum_q = _mm_sub_epi16(sum_p_0, pq[5]); + sum_p = _mm_sub_epi16(sum_p_0, q[5]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - flat2_p[1] = _mm_add_epi16(sum_p, work0); - flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]); + work0_1 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0]))); - sum_lq = _mm_sub_epi16(sum_lp, p[2]); + sum_lq = _mm_sub_epi16(sum_lp, pq[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); work0 = _mm_add_epi16(sum_p3, pq[1]); @@ -402,21 +401,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); - flat2_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); - flat2_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); - - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); - flat2_p[2] = _mm_add_epi16(sum_p, work0); - flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); - sum_lp = _mm_sub_epi16(sum_lp, q[1]); - sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_lq = _mm_sub_epi16(sum_lq, pq[1]); sum_p3 = _mm_add_epi16(sum_p3, pq[3]); work0 = _mm_add_epi16(sum_p3, pq[2]); @@ -425,54 +411,88 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); - flat2_p[3] = _mm_add_epi16(sum_p, work0); - flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[3] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); - flat2_p[4] = _mm_add_epi16(sum_p, work0); - flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[4] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); - - sum_p6 = _mm_add_epi16(sum_p6, pq[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - - work0 = _mm_add_epi16(sum_p6, - _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); - flat2_p[5] = _mm_add_epi16(sum_p, work0); - flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); - flat2_pq[5] = _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); - } - - // highbd_filter8 - pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); - pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); - - for (i = 0; i < 3; i++) { - pq[i] = _mm_andnot_si128(flat, pq[i]); - flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); - pq[i] = _mm_or_si128(pq[i], flat_pq[i]); - } - - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - pq[i] = _mm_andnot_si128(flat2, pq[i]); - flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); - // get values for when (flat2 && flat && mask) - pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0])); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, pq[3]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, pq[2]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + + sum_p6 = _mm_add_epi16(sum_p6, pq[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, pq[1]); + + work0 = _mm_add_epi16(sum_p6, + _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + } // flat2 + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // highbd_filter8 + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); + + for (i = 0; i < 3; i++) { + pq[i] = _mm_andnot_si128(flat, pq[i]); + flat_pq[i] = _mm_and_si128(flat, flat_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat_pq[i]); + } + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + if (flat2_mask) { + for (i = 0; i < 6; i++) { + pq[i] = _mm_andnot_si128(flat2, pq[i]); + flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]); + pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values + } + } + } else { + pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1); + pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1); } } @@ -500,6 +520,8 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1, const uint8_t *thr1, int bd) { __m128i blimit, limit, thresh, t80; + const __m128i zero = _mm_setzero_si128(); + get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh, &t80); __m128i mask; @@ -512,27 +534,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i ps[2], qs[2]; highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80); // flat and wide flat calculations - __m128i flat_p[3], flat_q[3]; - __m128i flat2_p[6], flat2_q[6]; - { + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); - __m128i sum_p = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); + __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3])); __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3])); __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1])); - sum_p = _mm_add_epi16(sum_p, sum_lp); + sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp); __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1])); sum_q = _mm_add_epi16(sum_q, sum_lq); - sum_p = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q)); sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); - flat2_p[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), - _mm_add_epi16(p[1], q[0]))), - 4); - flat2_q[0] = _mm_srli_epi16( - _mm_add_epi16(sum_p, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), - _mm_add_epi16(p[0], q[1]))), - 4); flat_p[0] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3); flat_q[0] = @@ -541,117 +558,160 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2( __m128i sum_q6 = _mm_add_epi16(q[6], q[6]); __m128i sum_p3 = _mm_add_epi16(p[3], p[3]); __m128i sum_q3 = _mm_add_epi16(q[3], q[3]); - sum_q = _mm_sub_epi16(sum_p, p[5]); - sum_p = _mm_sub_epi16(sum_p, q[5]); - flat2_p[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), - 4); - flat2_q[1] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), - 4); + + sum_q = _mm_sub_epi16(sum_p_0, p[5]); + __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]); + sum_lq = _mm_sub_epi16(sum_lp, p[2]); sum_lp = _mm_sub_epi16(sum_lp, q[2]); flat_p[1] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3); flat_q[1] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p3 = _mm_add_epi16(sum_p3, p[3]); - sum_q3 = _mm_add_epi16(sum_q3, q[3]); - sum_p = _mm_sub_epi16(sum_p, q[4]); - sum_q = _mm_sub_epi16(sum_q, p[4]); - flat2_p[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), - 4); - flat2_q[2] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), - 4); + sum_lp = _mm_sub_epi16(sum_lp, q[1]); sum_lq = _mm_sub_epi16(sum_lq, p[1]); + sum_p3 = _mm_add_epi16(sum_p3, p[3]); + sum_q3 = _mm_add_epi16(sum_q3, q[3]); flat_p[2] = _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3); flat_q[2] = _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[3]); - sum_q = _mm_sub_epi16(sum_q, p[3]); - flat2_p[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), - 4); - flat2_q[3] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[2]); - sum_q = _mm_sub_epi16(sum_q, p[2]); - flat2_p[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), - 4); - flat2_q[4] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), - 4); - sum_p6 = _mm_add_epi16(sum_p6, p[6]); - sum_q6 = _mm_add_epi16(sum_q6, q[6]); - sum_p = _mm_sub_epi16(sum_p, q[1]); - sum_q = _mm_sub_epi16(sum_q, p[1]); - flat2_p[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_p, _mm_add_epi16( - sum_p6, _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), - 4); - flat2_q[5] = _mm_srli_epi16( - _mm_add_epi16( - sum_q, _mm_add_epi16( - sum_q6, _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), - 4); - } - // highbd_filter8 - p[2] = _mm_andnot_si128(flat, p[2]); - // p2 remains unchanged if !(flat && mask) - flat_p[2] = _mm_and_si128(flat, flat_p[2]); - // when (flat && mask) - p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values - q[2] = _mm_andnot_si128(flat, q[2]); - flat_q[2] = _mm_and_si128(flat, flat_q[2]); - q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values - int i; - for (i = 1; i >= 0; i--) { - ps[i] = _mm_andnot_si128(flat, ps[i]); - flat_p[i] = _mm_and_si128(flat, flat_p[i]); - p[i] = _mm_or_si128(ps[i], flat_p[i]); - qs[i] = _mm_andnot_si128(flat, qs[i]); - flat_q[i] = _mm_and_si128(flat, flat_q[i]); - q[i] = _mm_or_si128(qs[i], flat_q[i]); - } - // highbd_filter16 - for (i = 5; i >= 0; i--) { - // p[i] remains unchanged if !(flat2 && flat && mask) - p[i] = _mm_andnot_si128(flat2, p[i]); - flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); - // get values for when (flat2 && flat && mask) - p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values - q[i] = _mm_andnot_si128(flat2, q[i]); - flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); - q[i] = _mm_or_si128(q[i], flat2_q[i]); + + int flat2_mask = + (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero))); + if (flat2_mask) { + flat2_p[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]), + _mm_add_epi16(p[1], q[0]))), + 4); + flat2_q[0] = _mm_srli_epi16( + _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]), + _mm_add_epi16(p[0], q[1]))), + 4); + + flat2_p[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))), + 4); + flat2_q[1] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[4]); + sum_q = _mm_sub_epi16(sum_q, p[4]); + flat2_p[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))), + 4); + flat2_q[2] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[3]); + sum_q = _mm_sub_epi16(sum_q, p[3]); + flat2_p[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))), + 4); + flat2_q[3] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[2]); + sum_q = _mm_sub_epi16(sum_q, p[2]); + flat2_p[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))), + 4); + flat2_q[4] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))), + 4); + sum_p6 = _mm_add_epi16(sum_p6, p[6]); + sum_q6 = _mm_add_epi16(sum_q6, q[6]); + sum_p = _mm_sub_epi16(sum_p, q[1]); + sum_q = _mm_sub_epi16(sum_q, p[1]); + flat2_p[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))), + 4); + flat2_q[5] = _mm_srli_epi16( + _mm_add_epi16( + sum_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))), + 4); + } + // highbd_filter8 + int i; + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + p[2] = _mm_andnot_si128(flat, p[2]); + // p2 remains unchanged if !(flat && mask) + flat_p[2] = _mm_and_si128(flat, flat_p[2]); + // when (flat && mask) + p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values + q[2] = _mm_andnot_si128(flat, q[2]); + flat_q[2] = _mm_and_si128(flat, flat_q[2]); + q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values + + for (i = 0; i < 2; i++) { + ps[i] = _mm_andnot_si128(flat, ps[i]); + flat_p[i] = _mm_and_si128(flat, flat_p[i]); + p[i] = _mm_or_si128(ps[i], flat_p[i]); + qs[i] = _mm_andnot_si128(flat, qs[i]); + flat_q[i] = _mm_and_si128(flat, flat_q[i]); + q[i] = _mm_or_si128(qs[i], flat_q[i]); + } + // highbd_filter16 + if (flat2_mask) { + for (i = 0; i < 6; i++) { + // p[i] remains unchanged if !(flat2 && flat && mask) + p[i] = _mm_andnot_si128(flat2, p[i]); + flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]); + // get values for when (flat2 && flat && mask) + p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values + q[i] = _mm_andnot_si128(flat2, q[i]); + flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]); + q[i] = _mm_or_si128(q[i], flat2_q[i]); + } + } + } else { + p[0] = ps[0]; + q[0] = qs[0]; + p[1] = ps[1]; + q[1] = qs[1]; } } @@ -696,6 +756,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0); flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8)); @@ -707,53 +770,56 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; // op1 - workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0), - _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2 - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), - *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 - workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4 + workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 // op0 - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1 - workp_a = - _mm_add_epi16(workp_a, - workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4 - - flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_a, workp_shft0), 3); + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + flat_p1p0 = _mm_srli_epi16(workp_b, 3); // oq0 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2), - *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4 - workp_b = _mm_add_epi16(*q1, *q2); - workp_shft0 = _mm_add_epi16( - workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]), + pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); // oq1 - workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1), - *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]), + pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4 workp_b = _mm_add_epi16(*q2, *q2); - workp_shft1 = _mm_add_epi16( - workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 - flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3); - } - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + flat_q0q1 = _mm_srli_epi16(workp_a, 3); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( @@ -797,6 +863,17 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + // flat_mask flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0)); flat = _mm_max_epi16(flat, work); @@ -806,7 +883,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b, workp_shft0, workp_shft1; // op1 @@ -842,33 +921,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2( workp_shft1 = _mm_add_epi16( workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4 oq1 = _mm_srli_epi16(workp_shft1, 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); - } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; + } } void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p, @@ -926,7 +1000,7 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( __m128i mask, hev, flat; __m128i pq[4]; __m128i p1p0, q1q0, ps1ps0, qs1qs0; - __m128i work_a, op2, oq2, flat_p1p0, flat_q0q1; + __m128i work_a, opq2, flat_p1p0, flat_q0q1; pq[0] = _mm_unpacklo_epi64(*p0, *q0); pq[1] = _mm_unpacklo_epi64(*p1, *q1); @@ -944,6 +1018,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit, &thresh, &hev, &mask); + // lp filter + highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd); + // flat_mask4 flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0])); flat = _mm_max_epi16(abs_p1p0, flat); @@ -956,15 +1033,15 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // replicate for the further "merged variables" usage flat = _mm_unpacklo_epi64(flat, flat); - { - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { + __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1; // Added before shift for rounding part of ROUND_POWER_OF_TWO // o*p2 workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0); - workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); - op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3); + workp_c = _mm_add_epi16(workp_a, workp_c); // o*p1 workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1); @@ -992,27 +1069,22 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2( // oq2 workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); - oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } + workp_a = _mm_add_epi16(workp_a, workp_b); + opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3); - // lp filter - highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0, &t80, bd); - - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); + work_a = _mm_andnot_si128(flat, pq[2]); + *p2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_a, *p2); + *q2 = _mm_srli_si128(*p2, 8); + } } static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( @@ -1058,17 +1130,28 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( mask = _mm_subs_epu16(mask, limit0); mask = _mm_cmpeq_epi16(mask, zero); + // lp filter + __m128i ps[2], qs[2], p[2], q[2]; + { + p[0] = *p0; + p[1] = *p1; + q[0] = *q0; + q[1] = *q1; + // filter_mask and hev_mask + highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + } + flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0)); flat = _mm_max_epi16(work1, flat); work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0)); flat = _mm_max_epi16(work0, flat); flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8)); - flat = _mm_cmpeq_epi16(flat, zero); flat = _mm_and_si128(flat, mask); // flat & mask - { + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) { __m128i workp_a, workp_b; // Added before shift for rounding part of ROUND_POWER_OF_TWO @@ -1101,42 +1184,36 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2( workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2); oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - } - // lp filter - __m128i ps[2], qs[2], p[2], q[2]; - { - p[0] = *p0; - p[1] = *p1; - q[0] = *q0; - q[1] = *q1; - // filter_mask and hev_mask - highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80); + qs[0] = _mm_andnot_si128(flat, qs[0]); + oq0 = _mm_and_si128(flat, oq0); + *q0 = _mm_or_si128(qs[0], oq0); + + qs[1] = _mm_andnot_si128(flat, qs[1]); + oq1 = _mm_and_si128(flat, oq1); + *q1 = _mm_or_si128(qs[1], oq1); + + ps[0] = _mm_andnot_si128(flat, ps[0]); + op0 = _mm_and_si128(flat, op0); + *p0 = _mm_or_si128(ps[0], op0); + + ps[1] = _mm_andnot_si128(flat, ps[1]); + op1 = _mm_and_si128(flat, op1); + *p1 = _mm_or_si128(ps[1], op1); + + work_a = _mm_andnot_si128(flat, *q2); + *q2 = _mm_and_si128(flat, oq2); + *q2 = _mm_or_si128(work_a, *q2); + + work_a = _mm_andnot_si128(flat, *p2); + *p2 = _mm_and_si128(flat, op2); + *p2 = _mm_or_si128(work_a, *p2); + } else { + *q0 = qs[0]; + *q1 = qs[1]; + *p0 = ps[0]; + *p1 = ps[1]; } - - qs[0] = _mm_andnot_si128(flat, qs[0]); - oq0 = _mm_and_si128(flat, oq0); - *q0 = _mm_or_si128(qs[0], oq0); - - qs[1] = _mm_andnot_si128(flat, qs[1]); - oq1 = _mm_and_si128(flat, oq1); - *q1 = _mm_or_si128(qs[1], oq1); - - ps[0] = _mm_andnot_si128(flat, ps[0]); - op0 = _mm_and_si128(flat, op0); - *p0 = _mm_or_si128(ps[0], op0); - - ps[1] = _mm_andnot_si128(flat, ps[1]); - op1 = _mm_and_si128(flat, op1); - *p1 = _mm_or_si128(ps[1], op1); - - work_a = _mm_andnot_si128(flat, *q2); - *q2 = _mm_and_si128(flat, oq2); - *q2 = _mm_or_si128(work_a, *q2); - - work_a = _mm_andnot_si128(flat, *p2); - *p2 = _mm_and_si128(flat, op2); - *p2 = _mm_or_si128(work_a, *p2); } void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c index dea113a29..b9689202a 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c @@ -110,7 +110,7 @@ static INLINE void quantize(const __m256i *qp, __m256i *c, } void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -120,12 +120,23 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, (void)scan; const unsigned int step = 8; - if (LIKELY(!skip_block)) { - __m256i qp[5], coeff; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + __m256i qp[5], coeff; + init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp); + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); + + __m256i eob = _mm256_setzero_si256(); + quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); + + coeff_ptr += step; + qcoeff_ptr += step; + dqcoeff_ptr += step; + iscan += step; + n_coeffs -= step; + + update_qp(qp); - __m256i eob = _mm256_setzero_si256(); + while (n_coeffs > 0) { + coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); coeff_ptr += step; @@ -133,40 +144,17 @@ void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr += step; iscan += step; n_coeffs -= step; - - update_qp(qp); - - while (n_coeffs > 0) { - coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr); - quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); - - coeff_ptr += step; - qcoeff_ptr += step; - dqcoeff_ptr += step; - iscan += step; - n_coeffs -= step; - } - { - __m256i eob_s; - eob_s = _mm256_shuffle_epi32(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 0xe); - eob = _mm256_max_epi16(eob, eob_s); - eob_s = _mm256_shufflelo_epi16(eob, 1); - eob = _mm256_max_epi16(eob, eob_s); - const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), - _mm256_extractf128_si256(eob, 1)); - *eob_ptr = _mm_extract_epi16(final_eob, 0); - } - } else { - do { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)qcoeff_ptr, zero); - _mm256_storeu_si256((__m256i *)dqcoeff_ptr, zero); - qcoeff_ptr += step; - dqcoeff_ptr += step; - n_coeffs -= step; - } while (n_coeffs > 0); - *eob_ptr = 0; + } + { + __m256i eob_s; + eob_s = _mm256_shuffle_epi32(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 0xe); + eob = _mm256_max_epi16(eob, eob_s); + eob_s = _mm256_shufflelo_epi16(eob, 1); + eob = _mm256_max_epi16(eob, eob_s); + const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob), + _mm256_extractf128_si256(eob, 1)); + *eob_ptr = _mm_extract_epi16(final_eob, 0); } } diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 5570ca5b7..58e5f98e5 100644 --- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -16,7 +16,7 @@ #include "aom_ports/mem.h" void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - int skip_block, const int16_t *zbin_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, @@ -41,50 +41,48 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; } } } @@ -92,8 +90,8 @@ void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void aom_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { @@ -116,38 +114,35 @@ void aom_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c new file mode 100644 index 000000000..9b1b4c9de --- /dev/null +++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include // AVX2 + +#include "config/aom_dsp_rtcd.h" + +typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum); + +void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + for (int i = 0; i < 8; i += 2) { + const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src); + const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); + const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref); + const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride)); + __m256i v_p_a = _mm256_castsi128_si256(v_p_a0); + __m256i v_p_b = _mm256_castsi128_si256(v_p_b0); + v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1); + v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride * 2; + ref += ref_stride * 2; + } + __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d)); + __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1)); + __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + const __m256i one = _mm256_set1_epi16(1); + for (int i = 0; i < 16; ++i) { + const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src); + const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref); + const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b); + const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff); + v_sum_d = _mm256_add_epi16(v_sum_d, v_diff); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff); + src += src_stride; + ref += ref_stride; + } + __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one); + __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d); + __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d); + __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h); + const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh); + const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1); + __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d); + v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8)); + *sum = _mm_extract_epi32(v_d, 0); + *sse = _mm_extract_epi32(v_d, 1); +} + +static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, int w, + int h, uint32_t *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int32_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, + ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); +} + +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2( \ + src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); +VAR_FN(16, 4, 16, 6); +VAR_FN(8, 32, 8, 8); +VAR_FN(32, 8, 8, 8); +VAR_FN(16, 64, 16, 10); +VAR_FN(64, 16, 16, 10); + +#undef VAR_FN diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index 131c16aa9..47b052abc 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -179,6 +179,9 @@ HIGH_GET_VAR(8); return (var >= 0) ? (uint32_t)var : 0; \ } +VAR_FN(128, 128, 16, 14); +VAR_FN(128, 64, 16, 13); +VAR_FN(64, 128, 16, 13); VAR_FN(64, 64, 16, 12); VAR_FN(64, 32, 16, 11); VAR_FN(32, 64, 16, 11); @@ -590,10 +593,10 @@ FNS(sse2); void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, - uint16_t *comp_pred, int width, int height, + uint8_t *comp_pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref8, int ref_stride, - int bd) { + const uint8_t *ref8, int ref_stride, int bd, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -606,8 +609,6 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, if (is_scaled) { // Note: This is mostly a copy from the >=8X8 case in // build_inter_predictors() function, with some small tweaks. - uint8_t *comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); - // Some assumptions. const int plane = 0; @@ -661,7 +662,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -677,10 +678,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width >= 8) { int i; assert(!(width & 7)); @@ -711,13 +715,13 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, } else if (!subpel_y_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); - aom_highbd_convolve8_horiz(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, kernel, 16, NULL, -1, width, height, bd); + aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16, + NULL, -1, width, height, bd); } else if (!subpel_x_q3) { const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - aom_highbd_convolve8_vert(ref8, ref_stride, CONVERT_TO_BYTEPTR(comp_pred), - width, NULL, -1, kernel, 16, width, height, bd); + aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1, + kernel, 16, width, height, bd); } else { DECLARE_ALIGNED(16, uint16_t, temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]); @@ -734,30 +738,29 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, intermediate_height, bd); aom_highbd_convolve8_vert( CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, CONVERT_TO_BYTEPTR(comp_pred), width, NULL, -1, kernel_y, - 16, width, height, bd); + MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, + bd); } } void aom_highbd_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd) { - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - int n; - int i; - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + int ref_stride, int bd, int subpel_search) { + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/ assert(!(width * height & 7)); - n = width * height >> 3; - for (i = 0; i < n; i++) { - __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred); + int n = width * height >> 3; + for (int i = 0; i < n; i++) { + __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16); __m128i p0 = _mm_loadu_si128((const __m128i *)pred); - _mm_storeu_si128((__m128i *)comp_pred, _mm_avg_epu16(s0, p0)); - comp_pred += 8; + _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0)); + comp_pred16 += 8; pred += 8; } } @@ -777,7 +780,7 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, shift); } -void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, +void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, @@ -792,6 +795,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, _mm_set_epi16(round, round, round, round, round, round, round, round); uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); if (width >= 8) { // Read 8 pixels one row at a time @@ -830,15 +834,16 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint16_t *comp_pred, void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, - const MV *const mv, uint16_t *comp_pred, const uint8_t *pred8, int width, + const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int subpel_search) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; int i; - aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, + aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width, height, subpel_x_q3, subpel_y_q3, ref8, ref_stride, - bd); + bd, subpel_search); assert(!(width * height & 7)); n = width * height >> 3; @@ -850,13 +855,14 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( const __m128i r = _mm_set_epi16(round, round, round, round, round, round, round, round); + uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8); for (i = 0; i < n; i++) { - __m128i p0 = xx_loadu_128(comp_pred); + __m128i p0 = xx_loadu_128(comp_pred16); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); - comp_pred += 8; + comp_pred16 += 8; pred += 8; } } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c index 6c247a91b..df5449a9d 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c @@ -168,8 +168,8 @@ uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); @@ -188,8 +188,8 @@ uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); @@ -208,8 +208,8 @@ uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, bilinear_filters_2t[yoffset]); - aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2), - 4); + aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, sse); diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c index eaf1f347b..f9a41a210 100644 --- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c +++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c @@ -120,11 +120,11 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param) { + int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c deleted file mode 100644 index 18862dd3e..000000000 --- a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c +++ /dev/null @@ -1,916 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include /* AVX2 */ - -#include "config/aom_dsp_rtcd.h" - -#include "aom_ports/mem.h" - -void aom_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; - __m128i abs_p1p0; - - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); - q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); - p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); - p0q0 = _mm_shuffle_epi32(q0p0, 78); - - { - __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; - abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); - abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); - ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); - abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(abs_p1p0, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i t1 = _mm_set1_epi16(0x1); - __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); - __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); - __m128i qs0 = _mm_xor_si128(p0q0, t80); - __m128i qs1 = _mm_xor_si128(p1q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; - __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - - filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, qs0ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (aom_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - filter1 = _mm_unpacklo_epi8(zero, filter1); - filter1 = _mm_srai_epi16(filter1, 0xB); - filter2 = _mm_unpacklo_epi8(zero, filter2); - filter2 = _mm_srai_epi16(filter2, 0xB); - - /* Filter1 >> 3 */ - filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); - qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi16(filter1, t1); - filt = _mm_srai_epi16(filt, 1); - filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), - filt); - filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); - qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); - // loopfilter done - - { - __m128i work; - flat = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), - _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); - q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); - - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); - q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); - - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), - _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); - - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); - q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); - - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), - _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m128i eight = _mm_set1_epi16(8); - const __m128i four = _mm_set1_epi16(4); - __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; - __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; - __m128i pixelFilter_p, pixelFilter_q; - __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; - __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - p7_16 = _mm_unpacklo_epi8(q7p7, zero); - p6_16 = _mm_unpacklo_epi8(q6p6, zero); - p5_16 = _mm_unpacklo_epi8(q5p5, zero); - p4_16 = _mm_unpacklo_epi8(q4p4, zero); - p3_16 = _mm_unpacklo_epi8(q3p3, zero); - p2_16 = _mm_unpacklo_epi8(q2p2, zero); - p1_16 = _mm_unpacklo_epi8(q1p1, zero); - p0_16 = _mm_unpacklo_epi8(q0p0, zero); - q0_16 = _mm_unpackhi_epi8(q0p0, zero); - q1_16 = _mm_unpackhi_epi8(q1p1, zero); - q2_16 = _mm_unpackhi_epi8(q2p2, zero); - q3_16 = _mm_unpackhi_epi8(q3p3, zero); - q4_16 = _mm_unpackhi_epi8(q4p4, zero); - q5_16 = _mm_unpackhi_epi8(q5p5, zero); - q6_16 = _mm_unpackhi_epi8(q6p6, zero); - q7_16 = _mm_unpackhi_epi8(q7p7, zero); - - pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), - _mm_add_epi16(p4_16, p3_16)); - pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), - _mm_add_epi16(q4_16, q3_16)); - - pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); - pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); - pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - pixelFilter_p = - _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); - pixetFilter_p2p1p0 = _mm_add_epi16( - four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); - flat2_q0p0 = _mm_packus_epi16(res_p, res_q); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); - - flat_q0p0 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(p7_16, p7_16); - sum_q7 = _mm_add_epi16(q7_16, q7_16); - sum_p3 = _mm_add_epi16(p3_16, p3_16); - sum_q3 = _mm_add_epi16(q3_16, q3_16); - - pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); - flat2_q1p1 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); - flat_q1p1 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); - pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); - flat_q2p2 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); - - sum_p7 = _mm_add_epi16(sum_p7, p7_16); - sum_q7 = _mm_add_epi16(sum_q7, q7_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); - res_p = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4); - res_q = _mm_srli_epi16( - _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); - flat2_q6p6 = _mm_packus_epi16(res_p, res_q); - } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); - - q2p2 = _mm_andnot_si128(flat, q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - q2p2 = _mm_or_si128(q2p2, flat_q2p2); - - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); - - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); - - q6p6 = _mm_andnot_si128(flat2, q6p6); - flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); - q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); - - q5p5 = _mm_andnot_si128(flat2, q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); - - q4p4 = _mm_andnot_si128(flat2, q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); - - q3p3 = _mm_andnot_si128(flat2, q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); - - q2p2 = _mm_andnot_si128(flat2, q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); - - q1p1 = _mm_andnot_si128(flat2, q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); - - q0p0 = _mm_andnot_si128(flat2, q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); - } -} - -DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { - 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128, - 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 -}; - -void aom_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi8(1); - __m128i p7, p6, p5; - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - __m128i q5, q6, q7; - __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, - p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; - - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - p256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); - p256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); - p256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); - p256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); - p256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); - q256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); - q256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); - q256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); - q256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); - q256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); - - p4 = _mm256_castsi256_si128(p256_4); - p3 = _mm256_castsi256_si128(p256_3); - p2 = _mm256_castsi256_si128(p256_2); - p1 = _mm256_castsi256_si128(p256_1); - p0 = _mm256_castsi256_si128(p256_0); - q0 = _mm256_castsi256_si128(q256_0); - q1 = _mm256_castsi256_si128(q256_1); - q2 = _mm256_castsi256_si128(q256_2); - q3 = _mm256_castsi256_si128(q256_3); - q4 = _mm256_castsi256_si128(q256_4); - - { - const __m128i abs_p1p0 = - _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = - _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = - _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = - _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); - __m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - } - - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = _mm_set1_epi8(0x7f); - - __m128i ps1 = _mm_xor_si128(p1, t80); - __m128i ps0 = _mm_xor_si128(p0, t80); - __m128i qs0 = _mm_xor_si128(q0, t80); - __m128i qs1 = _mm_xor_si128(q1, t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1, - flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5, - flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (aom_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - filt = _mm_andnot_si128(hev, filt); - ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - // loopfilter done - - { - __m128i work; - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4))); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - - p256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); - q256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); - p5 = _mm256_castsi256_si128(p256_5); - q5 = _mm256_castsi256_si128(q256_5); - flat2 = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)), - _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - - flat2 = _mm_max_epu8(work, flat2); - p256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); - q256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); - p6 = _mm256_castsi256_si128(p256_6); - q6 = _mm256_castsi256_si128(q256_6); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)), - _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - - flat2 = _mm_max_epu8(work, flat2); - - p256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); - q256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); - p7 = _mm256_castsi256_si128(p256_7); - q7 = _mm256_castsi256_si128(q256_7); - work = _mm_max_epu8( - _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)), - _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { - const __m256i eight = _mm256_set1_epi16(8); - const __m256i four = _mm256_set1_epi16(4); - __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0, - pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; - - const __m256i filter = - _mm256_load_si256((__m256i const *)filt_loopfilter_avx2); - p256_7 = _mm256_shuffle_epi8(p256_7, filter); - p256_6 = _mm256_shuffle_epi8(p256_6, filter); - p256_5 = _mm256_shuffle_epi8(p256_5, filter); - p256_4 = _mm256_shuffle_epi8(p256_4, filter); - p256_3 = _mm256_shuffle_epi8(p256_3, filter); - p256_2 = _mm256_shuffle_epi8(p256_2, filter); - p256_1 = _mm256_shuffle_epi8(p256_1, filter); - p256_0 = _mm256_shuffle_epi8(p256_0, filter); - q256_0 = _mm256_shuffle_epi8(q256_0, filter); - q256_1 = _mm256_shuffle_epi8(q256_1, filter); - q256_2 = _mm256_shuffle_epi8(q256_2, filter); - q256_3 = _mm256_shuffle_epi8(q256_3, filter); - q256_4 = _mm256_shuffle_epi8(q256_4, filter); - q256_5 = _mm256_shuffle_epi8(q256_5, filter); - q256_6 = _mm256_shuffle_epi8(q256_6, filter); - q256_7 = _mm256_shuffle_epi8(q256_7, filter); - - pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5), - _mm256_add_epi16(p256_4, p256_3)); - pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5), - _mm256_add_epi16(q256_4, q256_3)); - - pixetFilter_p2p1p0 = - _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1)); - pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); - - pixetFilter_q2q1q0 = - _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1)); - pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); - - pixelFilter_p = _mm256_add_epi16( - eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q)); - - pixetFilter_p2p1p0 = _mm256_add_epi16( - four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4); - - flat2_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4); - - flat2_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(p256_3, p256_0)), - 3); - - flat_p0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(q256_3, q256_0)), - 3); - - flat_q0 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(p256_7, p256_7); - - sum_q7 = _mm256_add_epi16(q256_7, q256_7); - - sum_p3 = _mm256_add_epi16(p256_3, p256_3); - - sum_q3 = _mm256_add_epi16(q256_3, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4); - - flat2_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4); - - flat2_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_1)), - 3); - - flat_p1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_1)), - 3); - - flat_q1 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - sum_p3 = _mm256_add_epi16(sum_p3, p256_3); - - sum_q3 = _mm256_add_epi16(sum_q3, q256_3); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4); - - flat2_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4); - - flat2_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1); - - pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1); - - res_p = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, - _mm256_add_epi16(sum_p3, p256_2)), - 3); - - flat_p2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = - _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, - _mm256_add_epi16(sum_q3, q256_2)), - 3); - - flat_q2 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4); - - flat2_p3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4); - - flat2_q3 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4); - - flat2_p4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4); - - flat2_q4 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4); - - flat2_p5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4); - - flat2_q5 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - - sum_p7 = _mm256_add_epi16(sum_p7, p256_7); - - sum_q7 = _mm256_add_epi16(sum_q7, q256_7); - - pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1); - - pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1); - - res_p = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4); - - flat2_p6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168)); - - res_q = _mm256_srli_epi16( - _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4); - - flat2_q6 = _mm256_castsi256_si128( - _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168)); - } - - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - p2 = _mm_andnot_si128(flat, p2); - flat_p2 = _mm_and_si128(flat, flat_p2); - p2 = _mm_or_si128(flat_p2, p2); - - p1 = _mm_andnot_si128(flat, ps1); - flat_p1 = _mm_and_si128(flat, flat_p1); - p1 = _mm_or_si128(flat_p1, p1); - - p0 = _mm_andnot_si128(flat, ps0); - flat_p0 = _mm_and_si128(flat, flat_p0); - p0 = _mm_or_si128(flat_p0, p0); - - q0 = _mm_andnot_si128(flat, qs0); - flat_q0 = _mm_and_si128(flat, flat_q0); - q0 = _mm_or_si128(flat_q0, q0); - - q1 = _mm_andnot_si128(flat, qs1); - flat_q1 = _mm_and_si128(flat, flat_q1); - q1 = _mm_or_si128(flat_q1, q1); - - q2 = _mm_andnot_si128(flat, q2); - flat_q2 = _mm_and_si128(flat, flat_q2); - q2 = _mm_or_si128(flat_q2, q2); - - p6 = _mm_andnot_si128(flat2, p6); - flat2_p6 = _mm_and_si128(flat2, flat2_p6); - p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); - - p5 = _mm_andnot_si128(flat2, p5); - flat2_p5 = _mm_and_si128(flat2, flat2_p5); - p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); - - p4 = _mm_andnot_si128(flat2, p4); - flat2_p4 = _mm_and_si128(flat2, flat2_p4); - p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); - - p3 = _mm_andnot_si128(flat2, p3); - flat2_p3 = _mm_and_si128(flat2, flat2_p3); - p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); - - p2 = _mm_andnot_si128(flat2, p2); - flat2_p2 = _mm_and_si128(flat2, flat2_p2); - p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - - p1 = _mm_andnot_si128(flat2, p1); - flat2_p1 = _mm_and_si128(flat2, flat2_p1); - p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - - p0 = _mm_andnot_si128(flat2, p0); - flat2_p0 = _mm_and_si128(flat2, flat2_p0); - p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - - q0 = _mm_andnot_si128(flat2, q0); - flat2_q0 = _mm_and_si128(flat2, flat2_q0); - q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - - q1 = _mm_andnot_si128(flat2, q1); - flat2_q1 = _mm_and_si128(flat2, flat2_q1); - q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - - q2 = _mm_andnot_si128(flat2, q2); - flat2_q2 = _mm_and_si128(flat2, flat2_q2); - q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - - q3 = _mm_andnot_si128(flat2, q3); - flat2_q3 = _mm_and_si128(flat2, flat2_q3); - q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); - - q4 = _mm_andnot_si128(flat2, q4); - flat2_q4 = _mm_and_si128(flat2, flat2_q4); - q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); - - q5 = _mm_andnot_si128(flat2, q5); - flat2_q5 = _mm_and_si128(flat2, flat2_q5); - q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); - - q6 = _mm_andnot_si128(flat2, q6); - flat2_q6 = _mm_and_si128(flat2, flat2_q6); - q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); - } - _mm256_zeroupper(); -} diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c index f1eac233b..9d88b5e49 100644 --- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c +++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c @@ -249,6 +249,63 @@ static INLINE void transpose16x8_8x16_sse2( *d7 = _mm_unpackhi_epi64(w7, w15); } +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them independently while flipping the second matrix horizontaly Used for 14 +// taps filter pq pairs inverse +static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, + __m128i *pq0, __m128i *pq1, + __m128i *pq2, __m128i *pq3) { + __m128i w10, w11, w12, w13; + __m128i w0, w1, w2, w3, w4, w5; + __m128i d0, d1, d2, d3; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w10 = _mm_unpacklo_epi8( + *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13 + w11 = _mm_unpacklo_epi8( + *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33 + w12 = _mm_unpacklo_epi8( + *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53 + w13 = _mm_unpacklo_epi8( + *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73 + + w4 = _mm_unpackhi_epi16( + w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpackhi_epi16( + w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + *pq0 = _mm_unpacklo_epi64(d0, d1); // pq + *pq1 = _mm_unpackhi_epi64(d0, d1); // pq + *pq2 = _mm_unpacklo_epi64(d2, d3); // pq + *pq3 = _mm_unpackhi_epi64(d2, d3); // pq +} + static INLINE void transpose8x16_16x8_sse2( __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, @@ -300,9 +357,120 @@ static INLINE void transpose8x16_16x8_sse2( *d14d15 = _mm_unpackhi_epi64(w7, w15); } +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontaly. Used +// for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 + + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx +} + static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { + __m128i filter, filter2filter1, work; + __m128i ps1ps0_work, qs1qs0_work; + __m128i hev1; + const __m128i t3t4 = + _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i ff = _mm_cmpeq_epi8(t80, t80); + + ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ + qs1qs0_work = _mm_xor_si128(*q1q0, t80); + + /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ + work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work); + filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev); + /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); + filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ + filter = _mm_and_si128(filter, *mask); /* & mask */ + filter = _mm_unpacklo_epi32(filter, filter); + + /* filter1 = signed_char_clamp(filter + 4) >> 3; */ + /* filter2 = signed_char_clamp(filter + 3) >> 3; */ + filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ + filter2filter1 = + _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit + filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ + filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1); + + /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ + filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ + filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit + filter = _mm_srai_epi16(filter, 9); /* round */ + filter = _mm_packs_epi16(filter, filter); + filter = _mm_andnot_si128(*hev, filter); + filter = _mm_unpacklo_epi32(filter, filter); + + filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter); + hev1 = _mm_srli_si128(filter2filter1, 8); + /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ + qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1); + /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ + ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1); + + *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */ + *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */ +} + +static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, + __m128i *hev, __m128i *mask, + __m128i *qs1qs0, + __m128i *ps1ps0) { const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); const __m128i t80 = _mm_set1_epi8(0x80); @@ -356,6 +524,49 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2( __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { __m128i q1p1, q0p0, p1p0, q1q0; __m128i abs_p0q0, abs_p1q1; + __m128i mask, flat, hev; + const __m128i zero = _mm_setzero_si128(); + + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); + + /* (abs(q1 - q0), abs(p1 - p0) */ + flat = abs_diff(q1p1, q0p0); + /* abs(p1 - q1), abs(p0 - q0) */ + __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); + + /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + hev = _mm_unpacklo_epi8(flat, zero); + + hev = _mm_cmpgt_epi16(hev, *thresh); + hev = _mm_packs_epi16(hev, hev); + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ + abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */ + abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1); + abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); + abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ + /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ + + mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); + mask = _mm_unpacklo_epi32(mask, flat); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4)); + + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); +} + +static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2( + __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit, + __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) { + __m128i q1p1, q0p0, p1p0, q1q0; + __m128i abs_p0q0, abs_p1q1; __m128i mask, hev; const __m128i zero = _mm_setzero_si128(); @@ -390,14 +601,14 @@ static AOM_FORCE_INLINE void lpf_internal_4_sse2( mask = _mm_cmpeq_epi8(mask, zero); mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); - filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); } void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, const uint8_t *_blimit, const uint8_t *_limit, const uint8_t *_thresh) { const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); @@ -413,9 +624,9 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); xx_storel_32(s - 1 * p, ps1ps0); - xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4)); xx_storel_32(s + 0 * p, qs1qs0); - xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 8)); + xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4)); } void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, @@ -425,7 +636,7 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, __m128i p1, p0, q0, q1; const __m128i zero = _mm_setzero_si128(); - __m128i limit = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), + __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit), _mm_loadl_epi64((const __m128i *)_limit)); __m128i thresh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); @@ -442,8 +653,8 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0); // Transpose 8x4 to 4x8 - p1 = _mm_srli_si128(p1p0, 8); - q1 = _mm_srli_si128(q1q0, 8); + p1 = _mm_srli_si128(p1p0, 4); + q1 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3); @@ -455,10 +666,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) { xx_storel_32(s - (num + 1) * p, x); - xx_storel_32(s + num * p, _mm_srli_si128(x, 8)); + xx_storel_32(s + num * p, _mm_srli_si128(x, 4)); } -static AOM_FORCE_INLINE void lpf_internal_14_sse2( +static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { @@ -503,38 +714,31 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); } // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0); qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0); // loopfilter done __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; - { - __m128i work; - flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); - flat = _mm_max_epu8(abs_p1p0, flat); - flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - work = abs_diff(*q6p6, *q0p0); - flat2 = _mm_max_epu8(work, flat2); - flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); - flat2 = _mm_subs_epu8(flat2, one); - flat2 = _mm_cmpeq_epi8(flat2, zero); - flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - } - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // flat and wide flat calculations - { + __m128i work; + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; @@ -619,137 +823,413 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - sum_p3 = _mm_add_epi16(sum_p3, p3_16); - sum_q3 = _mm_add_epi16(sum_q3, q3_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), - 4); - flat2_q2p2 = _mm_packus_epi16(res_p, res_q); - pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); + sum_p3 = _mm_add_epi16(sum_p3, p3_16); + sum_q3 = _mm_add_epi16(sum_q3, q3_16); + res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), - 4); - flat2_q3p3 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); - - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), - 4); - flat2_q4p4 = _mm_packus_epi16(res_p, res_q); - - sum_p6 = _mm_add_epi16(sum_p6, p6_16); - sum_q6 = _mm_add_epi16(sum_q6, q6_16); - pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); - pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + // work with flat2 + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask - res_p = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_p, - _mm_add_epi16(sum_p6, - _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), - 4); - res_q = _mm_srli_epi16( - _mm_add_epi16( - pixelFilter_q, - _mm_add_epi16(sum_q6, - _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), - 4); - flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat = _mm_unpacklo_epi64(flat, flat); + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_q2p2 = _mm_and_si128(flat, flat_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_q1p1 = _mm_and_si128(flat, flat_q1p1); + *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_q0p0 = _mm_and_si128(flat, flat_q0p0); + *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))), + 4); + flat2_q2p2 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))), + 4); + flat2_q3p3 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))), + 4); + flat2_q4p4 = _mm_packus_epi16(res_p, res_q); + + sum_p6 = _mm_add_epi16(sum_p6, p6_16); + sum_q6 = _mm_add_epi16(sum_q6, q6_16); + pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); + pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); + + res_p = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_p, + _mm_add_epi16(sum_p6, + _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))), + 4); + res_q = _mm_srli_epi16( + _mm_add_epi16( + pixelFilter_q, + _mm_add_epi16(sum_q6, + _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))), + 4); + flat2_q5p5 = _mm_packus_epi16(res_p, res_q); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_unpacklo_epi64(flat2, flat2); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); + *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); + *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); + *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); + *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); + *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); + *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; } - // wide flat - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +} - flat = _mm_shuffle_epi32(flat, 68); - flat2 = _mm_shuffle_epi32(flat2, 68); +static AOM_FORCE_INLINE void lpf_internal_14_sse2( + __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2, + __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi8(1); + __m128i mask, hev, flat, flat2; + __m128i flat2_pq[6], flat_pq[3]; + __m128i qs0ps0, qs1ps1; + __m128i p1p0, q1q0, qs1qs0, ps1ps0; + __m128i abs_p1p0; - *q2p2 = _mm_andnot_si128(flat, *q2p2); - flat_q2p2 = _mm_and_si128(flat, flat_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat_q2p2); + p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1); + q1q0 = _mm_srli_si128(p1p0, 8); - qs1ps1 = _mm_andnot_si128(flat, qs1ps1); - flat_q1p1 = _mm_and_si128(flat, flat_q1p1); - *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); + __m128i fe, ff, work; + { + __m128i abs_p1q1, abs_p0q0, abs_q1q0; + abs_p1p0 = abs_diff(*q1p1, *q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + fe = _mm_set1_epi8(0xfe); + ff = _mm_cmpeq_epi8(fe, fe); + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); - qs0ps0 = _mm_andnot_si128(flat, qs0ps0); - flat_q0p0 = _mm_and_si128(flat, flat_q0p0); - *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - *q5p5 = _mm_andnot_si128(flat2, *q5p5); - flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); - *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); - *q4p4 = _mm_andnot_si128(flat2, *q4p4); - flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); - *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; - *q3p3 = _mm_andnot_si128(flat2, *q3p3); - flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); - *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3); + work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2)); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + } - *q2p2 = _mm_andnot_si128(flat2, *q2p2); - flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); - *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2); + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0); + qs1ps1 = _mm_srli_si128(qs0ps0, 8); + // loopfilter done - *q1p1 = _mm_andnot_si128(flat2, *q1p1); - flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); - *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1); + flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // if flat ==0 then flat2 is zero as well and we don't need any calc below + // sse4.1 if (0==_mm_test_all_zeros(flat,ff)) + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; + __m128i pq_16[7]; + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + __m128i sum_p6; + __m128i sum_p3; + + pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero); + pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero); + pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero); + pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero); + pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero); + pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero); + pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero); + q0_16 = _mm_srli_si128(pq_16[0], 8); + q1_16 = _mm_srli_si128(pq_16[1], 8); + q2_16 = _mm_srli_si128(pq_16[2], 8); + q3_16 = _mm_srli_si128(pq_16[3], 8); + q4_16 = _mm_srli_si128(pq_16[4], 8); + q5_16 = _mm_srli_si128(pq_16[5], 8); + + __m128i flat_p[3], flat_q[3]; + __m128i flat2_p[6], flat2_q[6]; + + __m128i work0, work0_0, work0_1, sum_p_0; + __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3])); + __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1])); + sum_p = _mm_add_epi16(sum_p, sum_lp); + + __m128i sum_lq = _mm_srli_si128(sum_lp, 8); + __m128i sum_q = _mm_srli_si128(sum_p, 8); + + sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q)); + sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq)); + + flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0])); + flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16)); + + sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]); + sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]); + + sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]); + sum_p = _mm_sub_epi16(sum_p_0, q5_16); + + work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]); + work0_1 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0]))); + + sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]); + sum_lp = _mm_sub_epi16(sum_lp, q2_16); + + work0 = _mm_add_epi16(sum_p3, pq_16[1]); + flat_p[1] = _mm_add_epi16(sum_lp, work0); + flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + + flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3); + flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3); + flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]); + flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]); + + sum_lp = _mm_sub_epi16(sum_lp, q1_16); + sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]); + + sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]); + work0 = _mm_add_epi16(sum_p3, pq_16[2]); + + flat_p[2] = _mm_add_epi16(sum_lp, work0); + flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8)); + flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3); + flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]); + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0)); - *q0p0 = _mm_andnot_si128(flat2, *q0p0); - flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); - *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0); + work = abs_diff(*q6p6, *q0p0); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4)); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + flat2 = _mm_unpacklo_epi32(flat2, flat2); + + // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + qs0ps0 = _mm_andnot_si128(flat, qs0ps0); + flat_pq[0] = _mm_and_si128(flat, flat_pq[0]); + *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]); + + qs1ps1 = _mm_andnot_si128(flat, qs1ps1); + flat_pq[1] = _mm_and_si128(flat, flat_pq[1]); + *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]); + + *q2p2 = _mm_andnot_si128(flat, *q2p2); + flat_pq[2] = _mm_and_si128(flat, flat_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]); + + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) { + flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16)); + flat2_q[0] = _mm_add_epi16( + sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0])); + + flat2_p[1] = _mm_add_epi16(sum_p, work0_1); + flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8)); + + flat2_pq[0] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4); + flat2_pq[1] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4); + flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]); + flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]); + + sum_p = _mm_sub_epi16(sum_p, q4_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1]))); + flat2_p[2] = _mm_add_epi16(sum_p, work0); + flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[2] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4); + flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q3_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[3]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2]))); + flat2_p[3] = _mm_add_epi16(sum_p, work0); + flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[3] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4); + flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q2_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[2]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3]))); + flat2_p[4] = _mm_add_epi16(sum_p, work0); + flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[4] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4); + flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]); + + sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]); + sum_p = _mm_sub_epi16(sum_p, q1_16); + sum_q = _mm_sub_epi16(sum_q, pq_16[1]); + + work0 = _mm_add_epi16( + sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4]))); + flat2_p[5] = _mm_add_epi16(sum_p, work0); + flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8)); + flat2_pq[5] = + _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4); + flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]); + + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + *q0p0 = _mm_andnot_si128(flat2, *q0p0); + flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]); + *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]); + + *q1p1 = _mm_andnot_si128(flat2, *q1p1); + flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]); + *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]); + + *q2p2 = _mm_andnot_si128(flat2, *q2p2); + flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]); + *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]); + + *q3p3 = _mm_andnot_si128(flat2, *q3p3); + flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]); + *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]); + + *q4p4 = _mm_andnot_si128(flat2, *q4p4); + flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]); + *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]); + + *q5p5 = _mm_andnot_si128(flat2, *q5p5); + flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]); + *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]); + } + } else { + *q0p0 = qs0ps0; + *q1p1 = qs1ps1; + } } void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, @@ -761,22 +1241,22 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - q4p4 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), + q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), _mm_cvtsi32_si128(*(int *)(s + 4 * p))); - q3p3 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), + q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), _mm_cvtsi32_si128(*(int *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), + q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), _mm_cvtsi32_si128(*(int *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), + q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), _mm_cvtsi32_si128(*(int *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), + q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), _mm_cvtsi32_si128(*(int *)(s - 0 * p))); - q5p5 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), + q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), _mm_cvtsi32_si128(*(int *)(s + 5 * p))); - q6p6 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), + q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), _mm_cvtsi32_si128(*(int *)(s + 6 * p))); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, @@ -790,7 +1270,7 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, store_buffer_horz_8(q5p5, p, 5, s); } -static AOM_FORCE_INLINE void lpf_internal_6_sse2( +static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, __m128i *thresh) { @@ -810,6 +1290,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( const __m128i one = _mm_set1_epi8(1); const __m128i fe = _mm_set1_epi8(0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { // filter_mask and hev_mask __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -847,8 +1328,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); + + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); // flat_mask flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); @@ -861,9 +1343,9 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( } // 5 tap filter - { + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft0, workp_shft1; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); @@ -906,18 +1388,149 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); } +} - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(p1p0, q1q0, &hev, &mask, &qs1qs0, &ps1ps0); +static AOM_FORCE_INLINE void lpf_internal_6_sse2( + __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, + __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit, + __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1; + __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16; + __m128i ps1ps0, qs1qs0; + + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + *p1p0 = _mm_unpacklo_epi32(*p0, *p1); + *q1q0 = _mm_unpacklo_epi32(*q0, *q1); + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + { + // filter_mask and hev_mask + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(*p1p0, *q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + // considering sse doesn't have unsigned elements comparison the idea is + // to find at least one case when X > limit, it means the corresponding + // mask bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - *q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - *p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + work = abs_diff(q2p2, q1p1); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0); + + // flat_mask + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + } + + // 5 tap filter + // need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c; + __m128i pq0x2_pq1, pq1_pq2; + pq2_16 = _mm_unpacklo_epi8(q2p2, zero); + pq1_16 = _mm_unpacklo_epi8(q1p1, zero); + pq0_16 = _mm_unpacklo_epi8(q0p0, zero); + q0_16 = _mm_srli_si128(pq0_16, 8); + q2_16 = _mm_srli_si128(pq2_16, 8); + + // op1 + pq0x2_pq1 = + _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1 + pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2 + workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four), + pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4 + + workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16); + workp_b = + _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4 + + // op0 + workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1 + workp_a = _mm_add_epi16(workp_a, + workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4 + workp_b = _mm_unpacklo_epi64(workp_a, workp_b); + workp_b = _mm_srli_epi16(workp_b, 3); + + flat_p1p0 = _mm_packus_epi16(workp_b, workp_b); + + // oq0 + workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16), + pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4 + workp_b = _mm_srli_si128(pq1_pq2, 8); + workp_a = _mm_add_epi16( + workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4 + // workp_shft0 = _mm_srli_epi16(workp_a, 3); + + // oq1 + workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16), + pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4 + workp_b = _mm_add_epi16(q2_16, q2_16); + workp_b = + _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4 + + workp_a = _mm_unpacklo_epi64(workp_a, workp_b); + workp_a = _mm_srli_epi16(workp_a, 3); + + flat_q0q1 = _mm_packus_epi16(workp_a, workp_a); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0); + *q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0 = _mm_or_si128(qs1qs0, *q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0); + *p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0 = _mm_or_si128(ps1ps0, *p1p0); + } } void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, @@ -941,9 +1554,9 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); } void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, @@ -970,8 +1583,8 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, - &limit, &thresh); + lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, + &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); @@ -982,15 +1595,168 @@ void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p, static AOM_FORCE_INLINE void lpf_internal_8_sse2( __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, - __m128i *p2_out, __m128i *q2_out, __m128i *blimit, __m128i *limit, - __m128i *thresh) { + __m128i *blimit, __m128i *limit, __m128i *thresh) { const __m128i zero = _mm_setzero_si128(); __m128i mask, hev, flat; __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, flat_p1p0, flat_q0q1; __m128i q2p2, q1p1, q0p0; __m128i q1q0, p1p0, ps1ps0, qs1qs0; - __m128i work_a, op2, oq2; + __m128i work_pq, opq2, pq2; + + q3p3 = _mm_unpacklo_epi32(*p3, *q3); + q2p2 = _mm_unpacklo_epi32(*p2, *q2); + q1p1 = _mm_unpacklo_epi32(*p1, *q1); + q0p0 = _mm_unpacklo_epi32(*p0, *q0); + + p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0 + q1q0 = _mm_srli_si128(p1p0, 8); + + // filter_mask and hev_mask + + // considering sse doesn't have unsigned elements comparison the idea is to + // find at least one case when X > limit, it means the corresponding mask + // bit is set. + // to achieve that we find global max value of all inputs of abs(x-y) or + // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set + // otherwise - not + + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + + abs_p1p0 = abs_diff(q1p1, q0p0); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); + + abs_p0q0 = abs_diff(p1p0, q1q0); + abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, *thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + // replicate for the further "merged variables" usage + hev = _mm_unpacklo_epi32(hev, hev); + + abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit); + mask = _mm_unpacklo_epi32(mask, zero); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); + + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4)); + mask = _mm_subs_epu8(mask, *limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // lp filter - the same for 6, 8 and 14 versions + filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + + // flat_mask4 + flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); + flat = _mm_max_epu8(abs_p1p0, flat); + + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + // replicate for the further "merged variables" usage + flat = _mm_unpacklo_epi32(flat, flat); + flat = _mm_unpacklo_epi64(flat, flat); + + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { + const __m128i four = _mm_set1_epi16(4); + __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2; + p2_16 = _mm_unpacklo_epi8(*p2, zero); + p1_16 = _mm_unpacklo_epi8(*p1, zero); + p0_16 = _mm_unpacklo_epi8(*p0, zero); + q0_16 = _mm_unpacklo_epi8(*q0, zero); + q1_16 = _mm_unpacklo_epi8(*q1, zero); + q2_16 = _mm_unpacklo_epi8(*q2, zero); + p3_16 = _mm_unpacklo_epi8(*p3, zero); + q3_16 = _mm_unpacklo_epi8(*q3, zero); + + // op2 + workp_a = + _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); + workp_shft2 = _mm_add_epi16(workp_a, workp_b); + + // op1 + workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); + workp_c = _mm_add_epi16(workp_a, workp_b); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + // op0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_d, workp_c); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_p1p0 = _mm_packus_epi16(workp_c, workp_c); + + // oq0 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16); + // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + workp_c = _mm_add_epi16(workp_a, workp_b); + + // oq1 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16); + workp_d = _mm_add_epi16(workp_a, workp_b); + // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + + workp_c = _mm_unpacklo_epi64(workp_c, workp_d); + workp_c = _mm_srli_epi16(workp_c, 3); + flat_q0q1 = _mm_packus_epi16(workp_c, workp_c); + + // oq2 + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); + workp_shft1 = _mm_add_epi16(workp_a, workp_b); + + workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1); + workp_c = _mm_srli_epi16(workp_c, 3); + + opq2 = _mm_packus_epi16(workp_c, workp_c); + + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 4); + + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } +} + +static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( + __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1, + __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out, + __m128i *blimit, __m128i *limit, __m128i *thresh) { + const __m128i zero = _mm_setzero_si128(); + __m128i mask, hev, flat; + __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3, + flat_p1p0, flat_q0q1; + __m128i q2p2, q1p1, q0p0; + __m128i q1q0, p1p0, ps1ps0, qs1qs0; + __m128i work_pq, opq2, pq2; q3p3 = _mm_unpacklo_epi64(*p3, *q3); q2p2 = _mm_unpacklo_epi64(*p2, *q2); @@ -1043,11 +1809,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, *limit); mask = _mm_cmpeq_epi8(mask, zero); - // replicate for the further "merged variables" usage - mask = _mm_unpacklo_epi64(mask, mask); - // flat_mask4 + // lp filter - the same for 6, 8 and 14 versions + filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out); + // flat_mask4 flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); flat = _mm_max_epu8(abs_p1p0, flat); @@ -1059,11 +1825,11 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( flat = _mm_unpacklo_epi64(flat, flat); } - // filter8 - { + // filter8 need it only if flat !=0 + if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) { const __m128i four = _mm_set1_epi16(4); - __m128i workp_a, workp_b, workp_shft0, workp_shft1; + __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2; p2_16 = _mm_unpacklo_epi8(*p2, zero); p1_16 = _mm_unpacklo_epi8(*p1, zero); p0_16 = _mm_unpacklo_epi8(*p0, zero); @@ -1078,8 +1844,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16); workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16); - workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - op2 = _mm_packus_epi16(workp_shft0, workp_shft0); + workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); // op1 workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16); @@ -1108,27 +1873,22 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16); workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - oq2 = _mm_packus_epi16(workp_shft1, workp_shft1); - } - // lp filter - the same for 6, 8 and 14 versions - filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0); + opq2 = _mm_packus_epi16(workp_shft2, workp_shft1); - qs1qs0 = _mm_andnot_si128(flat, qs1qs0); - q1q0 = _mm_and_si128(flat, flat_q0q1); - *q1q0_out = _mm_or_si128(qs1qs0, q1q0); + work_pq = _mm_andnot_si128(flat, q2p2); + pq2 = _mm_and_si128(flat, opq2); + *p2 = _mm_or_si128(work_pq, pq2); + *q2 = _mm_srli_si128(*p2, 8); - ps1ps0 = _mm_andnot_si128(flat, ps1ps0); - p1p0 = _mm_and_si128(flat, flat_p1p0); - *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + qs1qs0 = _mm_andnot_si128(flat, *q1q0_out); + q1q0 = _mm_and_si128(flat, flat_q0q1); + *q1q0_out = _mm_or_si128(qs1qs0, q1q0); - work_a = _mm_andnot_si128(flat, *q2); - q2_16 = _mm_and_si128(flat, oq2); - *q2_out = _mm_or_si128(work_a, q2_16); - - work_a = _mm_andnot_si128(flat, *p2); - p2_16 = _mm_and_si128(flat, op2); - *p2_out = _mm_or_si128(work_a, p2_16); + ps1ps0 = _mm_andnot_si128(flat, *p1p0_out); + p1p0 = _mm_and_si128(flat, flat_p1p0); + *p1p0_out = _mm_or_si128(ps1ps0, p1p0); + } } void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, @@ -1136,7 +1896,7 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_limit, const unsigned char *_thresh) { __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0, p2_out, q2_out; + __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); @@ -1151,14 +1911,14 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &p2_out, &q2_out, &blimit, &limit, &thresh); + &blimit, &limit, &thresh); xx_storel_32(s - 1 * p, p1p0); - xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 8)); + xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4)); xx_storel_32(s + 0 * p, q1q0); - xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 8)); - xx_storel_32(s - 3 * p, p2_out); - xx_storel_32(s + 2 * p, q2_out); + xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4)); + xx_storel_32(s - 3 * p, p2); + xx_storel_32(s + 2 * p, q2); } void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, @@ -1196,8 +1956,8 @@ void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p, q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)), _mm_loadl_epi64((__m128i *)(s + 6 * p))); - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8)); @@ -1227,7 +1987,7 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, _mm_load_si128((__m128i *)_thresh1)); __m128i p2, p1, p0, q0, q1, q2, p3, q3; - __m128i q1q0, p1p0, p2_out, q2_out; + __m128i q1q0, p1p0; p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); @@ -1238,15 +1998,15 @@ void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, - &p2_out, &q2_out, &blimit, &limit, &thresh); + lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, + &blimit, &limit, &thresh); _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8)); _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0); _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8)); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2_out); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2_out); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); } void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, @@ -1282,7 +2042,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, __m128i t = _mm_unpacklo_epi64(thresh0, thresh1); - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8)); @@ -1331,7 +2091,7 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0, &q1); - lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); + lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0); p1 = _mm_srli_si128(ps1ps0, 8); q1 = _mm_srli_si128(qs1qs0, 8); @@ -1372,8 +2132,8 @@ void aom_lpf_vertical_6_sse2(unsigned char *s, int p, lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit, &limit, &thresh); - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3); @@ -1419,8 +2179,8 @@ void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); - lpf_internal_6_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, &blimit, - &limit, &thresh); + lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0, + &blimit, &limit, &thresh); p0 = _mm_srli_si128(p1p0, 8); q0 = _mm_srli_si128(q1q0, 8); @@ -1444,7 +2204,7 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p, const unsigned char *_thresh) { __m128i d0, d1, d2, d3, d4, d5, d6, d7; - __m128i p2, p0, q0, q2; + __m128i p0, q0; __m128i x2, x1, x0, x3; __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); @@ -1459,13 +2219,13 @@ void aom_lpf_vertical_8_sse2(unsigned char *s, int p, transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); // Loop filtering - lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, &p2, - &q2, &blimit, &limit, &thresh); + lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0, + &blimit, &limit, &thresh); - p0 = _mm_srli_si128(p1p0, 8); - q0 = _mm_srli_si128(q1q0, 8); + p0 = _mm_srli_si128(p1p0, 4); + q0 = _mm_srli_si128(q1q0, 4); - transpose8x8_low_sse2(&d0, &p2, &p0, &p1p0, &q1q0, &q0, &q2, &d7, &d0, &d1, + transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1, &d2, &d3); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0); @@ -1490,7 +2250,7 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, __m128i x0, x1, x2, x3, x4, x5, x6, x7; __m128i d1, d3, d5, d7; __m128i q1q0, p1p0; - __m128i p2, p1, q1, q2; + __m128i p1, q1; __m128i d0d1, d2d3, d4d5, d6d7; x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p)); @@ -1510,14 +2270,14 @@ void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, d5 = _mm_srli_si128(d4d5, 8); d7 = _mm_srli_si128(d6d7, 8); - lpf_internal_8_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, &q1q0, - &p1p0, &p2, &q2, &blimit, &limit, &thresh); + lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5, + &q1q0, &p1p0, &blimit, &limit, &thresh); p1 = _mm_srli_si128(p1p0, 8); q1 = _mm_srli_si128(q1q0, 8); - transpose8x8_sse2(&d0d1, &p2, &p1, &p1p0, &q1q0, &q1, &q2, &d7, &d0d1, &d2d3, - &d4d5, &d6d7); + transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1, + &d2d3, &d4d5, &d6d7); _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1); _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8)); @@ -1533,65 +2293,30 @@ void aom_lpf_vertical_14_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; - __m128i x6, x5, x4, x3, x2, x1, x0; - __m128i p0, p1, p2, p3, p4, p5, p6, p7; - __m128i q0, q1, q2, q3, q4, q5, q6, q7; - __m128i p0_out, p1_out, p2_out, p3_out; + __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0; + __m128i x6, x5, x4, x3; + __m128i pq0, pq1, pq2, pq3; __m128i blimit = _mm_load_si128((__m128i *)_blimit); __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); - x6 = _mm_loadl_epi64((__m128i *)((s - 8) + 0 * p)); - x5 = _mm_loadl_epi64((__m128i *)((s - 8) + 1 * p)); - x4 = _mm_loadl_epi64((__m128i *)((s - 8) + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)((s - 8) + 3 * p)); - - transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &p0, &p1, &p2, &p3, &p4, &p5, &p6, - &p7); + x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p)); + x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p)); + x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p)); + x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p)); - x6 = _mm_loadl_epi64((__m128i *)(s + 0 * p)); - x5 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - x4 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - x3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); - - transpose4x8_8x4_sse2(&x6, &x5, &x4, &x3, &q0, &q1, &q2, &q3, &q4, &q5, &q6, - &q7); - - q6p6 = _mm_unpacklo_epi64(p1, q6); - q5p5 = _mm_unpacklo_epi64(p2, q5); - q4p4 = _mm_unpacklo_epi64(p3, q4); - q3p3 = _mm_unpacklo_epi64(p4, q3); - q2p2 = _mm_unpacklo_epi64(p5, q2); - q1p1 = _mm_unpacklo_epi64(p6, q1); - q0p0 = _mm_unpacklo_epi64(p7, q0); + transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4, + &q5p5, &q6p6, &q7p7); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); - transpose8x8_low_sse2(&p0, &p1, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, - &p0_out, &p1_out, &p2_out, &p3_out); - - x0 = _mm_srli_si128(q0p0, 8); - x1 = _mm_srli_si128(q1p1, 8); - x2 = _mm_srli_si128(q2p2, 8); - x3 = _mm_srli_si128(q3p3, 8); - x4 = _mm_srli_si128(q4p4, 8); - x5 = _mm_srli_si128(q5p5, 8); - x6 = _mm_srli_si128(q6p6, 8); - - transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &q0, &q1, &q2, - &q3); - - _mm_storel_epi64((__m128i *)(s - 8 + 0 * p), p0_out); - _mm_storel_epi64((__m128i *)(s - 8 + 1 * p), p1_out); - _mm_storel_epi64((__m128i *)(s - 8 + 2 * p), p2_out); - _mm_storel_epi64((__m128i *)(s - 8 + 3 * p), p3_out); - - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - _mm_storel_epi64((__m128i *)(s + 3 * p), q3); + transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, + &q0p0, &pq0, &pq1, &pq2, &pq3); + _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0); + _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1); + _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2); + _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3); } void aom_lpf_vertical_14_dual_sse2( @@ -1634,8 +2359,8 @@ void aom_lpf_vertical_14_dual_sse2( q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8)); q7 = _mm_srli_si128(d14d15, 8); - lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, - &limit, &thresh); + lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, + &blimit, &limit, &thresh); x0 = _mm_srli_si128(q0p0, 8); x1 = _mm_srli_si128(q1p1, 8); diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h index c6b6469b4..8970fe7dd 100644 --- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_LPF_COMMON_X86_H -#define _AOM_DSP_X86_LPF_COMMON_X86_H +#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ #include // SSE2 @@ -212,4 +212,4 @@ static INLINE void highbd_transpose8x16_sse2( d4 + 1, d5 + 1, d6 + 1, d7 + 1); } -#endif // _AOM_DSP_X86_LPF_COMMON_X86_H +#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c index 6538e4d5e..584b5e7e3 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -9,7 +9,6 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include #include #include "config/aom_config.h" diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h index 19b429d91..cffbd9672 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h +++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H -#define _AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H +#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, @@ -30,4 +30,4 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, const uint8_t *m_ptr, int m_stride, int height); -#endif +#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h index dc41a8342..4faa098ac 100644 --- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h +++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H -#define _AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H +#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ +#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ #include #include @@ -89,4 +89,4 @@ static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height, } while (i < height); } -#endif +#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h index 8b69606dd..6c821673e 100644 --- a/third_party/aom/aom_dsp/x86/mem_sse2.h +++ b/third_party/aom/aom_dsp/x86/mem_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_MEM_SSE2_H_ -#define AOM_DSP_X86_MEM_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ +#define AOM_AOM_DSP_X86_MEM_SSE2_H_ #include // SSE2 @@ -39,4 +39,4 @@ static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, return dst; } -#endif // AOM_DSP_X86_MEM_SSE2_H_ +#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h new file mode 100644 index 000000000..5181e444c --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ + +#include + +#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" + +static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int h) { + const int pre_step = pre_stride - 4; + int n = 0; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + + assert(IS_POWER_OF_TWO(h)); + + do { + const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n)); + const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n)); + const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n)); + + const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); + + const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); + const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); + const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + n += 4; + + if (n % 4 == 0) pre += pre_step; + } while (n < 4 * h); + + *sum = xx_hsum_epi32_si32(v_sum_d); + *sse = xx_hsum_epi32_si32(v_sse_d); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h index a3535f985..48486c6c4 100644 --- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h +++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ -#define AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ #include @@ -42,4 +42,13 @@ static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) { return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q)); } -#endif // AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ +// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) +static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { + const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); + const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); + const __m128i v_tmp_d = + _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); + return _mm_srai_epi32(v_tmp_d, bits); +} + +#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_ diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c new file mode 100644 index 000000000..bfec0e8a8 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" + +//////////////////////////////////////////////////////////////////////////////// +// 8 bit +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m128i v_sum_d = _mm_setzero_si128(); + __m128i v_sse_d = _mm_setzero_si128(); + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m128i v_d; + const uint8_t *pre_temp; + assert(w >= 8); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp); + const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + + // Values in both pre and mask fit in 15 bits, and are packed at 32 bit + // boundaries. We use pmaddwd, as it has lower latency on Haswell + // than pmulld but produces the same result with these inputs. + const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d); + const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d); + + const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_tmp_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d); + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12); + const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d); + const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1); + + const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d); + const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); + v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 8; + n += 8; + width -= 8; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + v_d = _mm_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm_hadd_epi32(v_d, v_d); + *sum = _mm_cvtsi128_si32(v_d); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4)); +} + +static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride, + const int32_t *wsrc, const int32_t *mask, + unsigned int *const sse, int *const sum, + const int w, const int h) { + int n = 0, width, height = h; + __m256i v_d; + __m128i res0; + const uint8_t *pre_temp; + const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1); + __m256i v_sum_d = _mm256_setzero_si256(); + __m256i v_sse_d = _mm256_setzero_si256(); + + assert(w >= 16); + assert(IS_POWER_OF_TWO(w)); + assert(IS_POWER_OF_TWO(h)); + do { + width = w; + pre_temp = pre; + do { + const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp); + const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n)); + const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n)); + const __m256i v_m1_d = + _mm256_loadu_si256((__m256i const *)(mask + n + 8)); + const __m256i v_w1_d = + _mm256_loadu_si256((__m256i const *)(wsrc + n + 8)); + + const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b); + const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8)); + + const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d); + const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d); + + const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d); + const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d); + + const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31); + const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31); + + const __m256i v_tmp0_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d); + const __m256i v_tmp1_d = + _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d); + + const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12); + const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12); + + const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d); + const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w); + + v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d); + v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d); + + pre_temp += 16; + n += 16; + width -= 16; + } while (width > 0); + pre += pre_stride; + height -= 1; + } while (height > 0); + + v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d); + v_d = _mm256_hadd_epi32(v_d, v_d); + res0 = _mm256_castsi256_si128(v_d); + res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1)); + *sum = _mm_cvtsi128_si32(res0); + *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4)); +} + +#define OBMCVARWXH(W, H) \ + unsigned int aom_obmc_variance##W##x##H##_avx2( \ + const uint8_t *pre, int pre_stride, const int32_t *wsrc, \ + const int32_t *mask, unsigned int *sse) { \ + int sum; \ + if (W == 4) { \ + obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \ + } else if (W == 8) { \ + obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } else { \ + obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \ + } \ + \ + return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ + } + +OBMCVARWXH(128, 128) +OBMCVARWXH(128, 64) +OBMCVARWXH(64, 128) +OBMCVARWXH(64, 64) +OBMCVARWXH(64, 32) +OBMCVARWXH(32, 64) +OBMCVARWXH(32, 32) +OBMCVARWXH(32, 16) +OBMCVARWXH(16, 32) +OBMCVARWXH(16, 16) +OBMCVARWXH(16, 8) +OBMCVARWXH(8, 16) +OBMCVARWXH(8, 8) +OBMCVARWXH(8, 4) +OBMCVARWXH(4, 8) +OBMCVARWXH(4, 4) +OBMCVARWXH(4, 16) +OBMCVARWXH(16, 4) +OBMCVARWXH(8, 32) +OBMCVARWXH(32, 8) +OBMCVARWXH(16, 64) +OBMCVARWXH(64, 16) diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c index 2e2f6e09f..72eda0e57 100644 --- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c +++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c @@ -19,7 +19,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/obmc_intrinsic_ssse3.h" +#include "aom_dsp/x86/obmc_intrinsic_sse4.h" #include "aom_dsp/x86/synonyms.h" //////////////////////////////////////////////////////////////////////////////// @@ -36,45 +36,6 @@ void aom_var_filter_block2d_bil_second_pass_ssse3( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); -static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride, - const int32_t *wsrc, const int32_t *mask, - unsigned int *const sse, int *const sum, - const int h) { - const int pre_step = pre_stride - 4; - int n = 0; - __m128i v_sum_d = _mm_setzero_si128(); - __m128i v_sse_d = _mm_setzero_si128(); - - assert(IS_POWER_OF_TWO(h)); - - do { - const __m128i v_p_b = xx_loadl_32(pre + n); - const __m128i v_m_d = xx_load_128(mask + n); - const __m128i v_w_d = xx_load_128(wsrc + n); - - const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); - - // Values in both pre and mask fit in 15 bits, and are packed at 32 bit - // boundaries. We use pmaddwd, as it has lower latency on Haswell - // than pmulld but produces the same result with these inputs. - const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); - - const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); - const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12); - const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d); - - v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d); - v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d); - - n += 4; - - if (n % 4 == 0) pre += pre_step; - } while (n < 4 * h); - - *sum = xx_hsum_epi32_si32(v_sum_d); - *sse = xx_hsum_epi32_si32(v_sse_d); -} - static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *const sse, int *const sum, diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm index e6b40262d..216a0bd8f 100644 --- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm @@ -16,16 +16,12 @@ SECTION .text %macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan vzeroupper - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - %ifnidn %1, b_32x32 ; Special case for ncoeff == 16, as it is frequent and we can save on @@ -83,14 +79,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ .single_nonzero: ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift + movifnidn r3, roundmp + movifnidn r4, quantmp + mov r6, dequantmp + mov r5, shiftmp + mova m1, [r3] ; m1 = round + mova m2, [r4] ; m2 = quant + mova m3, [r6] ; m3 = dequant + mova m4, [r5] ; m4 = shift mov r3, iscanmp @@ -174,20 +170,20 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ %endif ; %ifnidn %1, b_32x32 -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ +DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ qcoeff, dqcoeff, dequant, eob, scan, iscan ; Actual quantization loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp + movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant + mova m3, [dequantq] ; m3 = dequant pcmpeqw m4, m4 ; All lanes -1 %ifidn %1, b_32x32 psubw m0, m4 @@ -199,7 +195,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ mov r2, shiftmp mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift + mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, b_32x32 @@ -207,7 +203,7 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] @@ -432,39 +428,8 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ mov [r2], ax vzeroupper RET - - ; Skip-block, i.e. just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - neg ncoeffq - pxor m7, m7 - -.blank_loop: - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET %endmacro INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c index 46b9c7d29..d3de6e24d 100644 --- a/third_party/aom/aom_dsp/x86/quantize_sse2.c +++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c @@ -9,242 +9,139 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include #include #include #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { - if (sizeof(tran_low_t) == 4) { - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); - } else { - return _mm_load_si128((const __m128i *)coeff_ptr); - } + assert(sizeof(tran_low_t) == 4); + + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); } static INLINE void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { - if (sizeof(tran_low_t) == 4) { - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); - } else { - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); - } + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); } void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *zbin_ptr, const int16_t *round_ptr, + const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + (void)scan_ptr; - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; - - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm index e2c1ebb71..39d4ca674 100644 --- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -18,21 +18,18 @@ pw_1: times 8 dw 1 SECTION .text -; TODO(yunqingwang)fix quantize_b code for skip=1 case. %macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ shift, qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp movifnidn zbinq, zbinmp movifnidn roundq, roundmp movifnidn quantq, quantmp + movifnidn dequantq, dequantmp mova m0, [zbinq] ; m0 = zbin mova m1, [roundq] ; m1 = round mova m2, [quantq] ; m2 = quant @@ -44,18 +41,18 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ psrlw m0, 1 ; m0 = (m0 + 1) / 2 psrlw m1, 1 ; m1 = (m1 + 1) / 2 %endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [GLOBAL(pw_1)] + mova m3, [dequantq] ; m3 = dequant mov r2, shiftmp - mov r3, qcoeffmp + psubw m0, [GLOBAL(pw_1)] mova m4, [r2] ; m4 = shift + mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp %ifidn %1, b_32x32 psllw m4, 1 %endif pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob + DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] lea qcoeffq, [ qcoeffq+ncoeffq*4] lea dqcoeffq, [dqcoeffq+ncoeffq*4] @@ -268,33 +265,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ pextrw r6, m8, 0 mov [r2], r6 RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - neg ncoeffq - pxor m7, m7 -.blank_loop: - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET %endmacro INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 +QUANTIZE_FN b, 9 +QUANTIZE_FN b_32x32, 9 diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..4eed7dd29 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/quantize_x86.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include + +#include "aom/aom_integer.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. +static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c new file mode 100644 index 000000000..305dde5c0 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_avx2.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include +#include + +#include "config/aom_dsp_rtcd.h" + +#include "aom_ports/mem.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = yy_loadu_256(a); + const __m256i v_b0 = yy_loadu_256(b); + const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); + const __m256i v_a01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); + const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); + const __m256i v_b01_w = + _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + xx_storel_64(&sum, sum_1x64); + return sum; +} + +int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = + _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + sse_w32_avx2(&sum, a + 64, b + 64); + sse_w32_avx2(&sum, a + 96, b + 96); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = yy_loadu_256(a); + const __m256i v_b_w = yy_loadu_256(b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 128: + do { + highbd_sse_w16_avx2(&sum, a, b); + highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: break; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..8b5af8469 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sse_sse4.c @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" + +#include "aom_ports/mem.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_b0 = xx_loadu_128(b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); + sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); + sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); + sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + + return sse; +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = xx_loadu_128(a); + const __m128i v_b_w = xx_loadu_128(b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8, b + 8); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 128: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: break; + } + return sse; +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c new file mode 100644 index 000000000..0af44e3a4 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" +#include "aom_dsp/x86/sum_squares_sse2.h" +#include "config/aom_dsp_rtcd.h" + +static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride, + int width, int height) { + uint64_t result; + __m256i v_acc_q = _mm256_setzero_si256(); + const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff); + for (int col = 0; col < height; col += 4) { + __m256i v_acc_d = _mm256_setzero_si256(); + for (int row = 0; row < width; row += 16) { + const int16_t *tempsrc = src + row; + const __m256i v_val_0_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride)); + const __m256i v_val_1_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride)); + const __m256i v_val_2_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride)); + const __m256i v_val_3_w = + _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride)); + + const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w); + const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w); + const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w); + const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w); + + const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d); + const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d); + const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d); + + v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d); + } + v_acc_q = + _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32)); + src += 4 * stride; + } + __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q); + __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1); + __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value); + + result_64_2_int = _mm_add_epi64( + result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int)); + + xx_storel_64(&result, result_64_2_int); + + return result; +} + +uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, + int height) { + if (LIKELY(width == 4 && height == 4)) { + return aom_sum_squares_2d_i16_4x4_sse2(src, stride); + } else if (LIKELY(width == 4 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height); + } else if (LIKELY(width == 8 && (height & 3) == 0)) { + return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height); + } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) { + return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height); + } else { + return aom_sum_squares_2d_i16_c(src, stride, width, height); + } +} diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c index a79f22d79..22d7739ec 100644 --- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c @@ -14,6 +14,7 @@ #include #include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/sum_squares_sse2.h" #include "config/aom_dsp_rtcd.h" static INLINE __m128i xx_loadh_64(__m128i a, const void *b) { @@ -44,8 +45,7 @@ static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) { return _mm_add_epi32(v_sq_01_d, v_sq_23_d); } -static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) { const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride); __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); @@ -53,8 +53,8 @@ static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, return (uint64_t)_mm_cvtsi128_si32(v_sum_d); } -static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, - int height) { +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height) { int r = 0; __m128i v_acc_q = _mm_setzero_si128(); do { @@ -76,7 +76,7 @@ static uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, // maintenance instructions in the common case of 4x4. __attribute__((noinline)) #endif -static uint64_t +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width, int height) { int r = 0; diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h new file mode 100644 index 000000000..491e31cc5 --- /dev/null +++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_ +#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_ + +uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, + int width, int height); + +uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride, + int height); +uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride); + +#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index d9a53fcc5..1e9f1e27b 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_SYNONYMS_H_ -#define AOM_DSP_X86_SYNONYMS_H_ +#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_H_ #include @@ -103,15 +103,6 @@ static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) { return _mm_srai_epi32(v_tmp_d, bits); } -// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits) -static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) { - const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1); - const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31); - const __m128i v_tmp_d = - _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d); - return _mm_srai_epi32(v_tmp_d, bits); -} - static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1); const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15); @@ -120,4 +111,4 @@ static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) { return _mm_srai_epi16(v_tmp_d, bits); } -#endif // AOM_DSP_X86_SYNONYMS_H_ +#endif // AOM_AOM_DSP_X86_SYNONYMS_H_ diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h index 39f371fc9..3f69b120e 100644 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_SYNONYMS_AVX2_H_ -#define AOM_DSP_X86_SYNONYMS_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ +#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ #include @@ -61,4 +61,14 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } -#endif // AOM_DSP_X86_SYNONYMS_AVX2_H_ +static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((__m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((__m128i *)(lo)); + return yy_set_m128i(mhi, mlo); +} + +static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { + const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); + return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); +} +#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h index f88a1527d..d0d1ee684 100644 --- a/third_party/aom/aom_dsp/x86/transpose_sse2.h +++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TRANSPOSE_SSE2_H_ -#define AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ #include // SSE2 @@ -417,4 +417,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in, out[7] = _mm_unpackhi_epi64(a6, a7); } -#endif // AOM_DSP_X86_TRANSPOSE_SSE2_H_ +#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h index bdff64b8f..b1611ba87 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H_ -#define AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ #include #include "aom/aom_integer.h" @@ -196,4 +196,4 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { } #endif -#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H_ +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_ diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h index 58a792424..ed82eee96 100644 --- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h +++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h @@ -9,8 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_ -#define AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ #include #include "aom/aom_integer.h" @@ -26,4 +26,4 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) { return _mm_shuffle_epi32(b, 0x4e); } -#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_ +#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index a7ac2c93d..800aef126 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -433,13 +433,14 @@ static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0, return comp; } -void aom_highbd_comp_mask_pred_avx2(uint16_t *comp_pred, const uint8_t *pred8, +void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask) { int i = 0; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); const uint16_t *src0 = invert_mask ? pred : ref; const uint16_t *src1 = invert_mask ? ref : pred; const int stride0 = invert_mask ? width : ref_stride; diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index 7e3c5d5db..3c37e77c0 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -16,6 +16,7 @@ #include "config/aom_dsp_rtcd.h" #include "config/av1_rtcd.h" +#include "aom_dsp/blend.h" #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" @@ -485,7 +486,8 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3, int subpel_y_q3, - const uint8_t *ref, int ref_stride) { + const uint8_t *ref, int ref_stride, + int subpel_search) { // expect xd == NULL only in tests if (xd != NULL) { const MB_MODE_INFO *mi = xd->mi[0]; @@ -553,7 +555,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(ref_num, 0, plane, xd->bd); + ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); const InterpFilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); @@ -570,7 +572,10 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, } const InterpFilterParams *filter = - av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + (subpel_search == 1) + ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) + : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); + int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { if (width >= 16) { @@ -632,15 +637,25 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); - const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); + uint8_t *temp_start_horiz = + (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; + uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); + int intermediate_height = + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_convolve8_horiz(ref - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1, - width, intermediate_height); - aom_convolve8_vert(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1), - MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16, - width, height); + // TODO(Deepa): Remove the memset below when we have + // 4 tap simd for sse2 and ssse3. + if (subpel_search == 1) { + memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width); + memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width); + } + aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE, + kernel_x, 16, NULL, -1, width, intermediate_height); + aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1, + kernel_y, 16, width, height); } } @@ -648,11 +663,11 @@ void aom_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride) { + int ref_stride, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, - subpel_x_q3, subpel_y_q3, ref, ref_stride); + subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search); /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/ assert(!(width * height & 15)); n = width * height >> 4; @@ -664,3 +679,128 @@ void aom_comp_avg_upsampled_pred_sse2( pred += 16; } } + +void aom_comp_mask_upsampled_pred_sse2( + MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, + const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, + int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, + int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask, + int subpel_search) { + if (subpel_x_q3 | subpel_y_q3) { + aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, + subpel_x_q3, subpel_y_q3, ref, ref_stride, + subpel_search); + ref = comp_pred; + ref_stride = width; + } + aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask, + mask_stride, invert_mask); +} + +static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0, + const __m128i s1, + const __m128i a) { + const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); + const __m128i round_const = + _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); + const __m128i a_inv = _mm_sub_epi16(alpha_max, a); + + const __m128i s_lo = _mm_unpacklo_epi16(s0, s1); + const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv); + const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo); + const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i s_hi = _mm_unpackhi_epi16(s0, s1); + const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv); + const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi); + const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const), + AOM_BLEND_A64_ROUND_BITS); + + const __m128i comp = _mm_packs_epi32(pred_l, pred_h); + + return comp; +} + +void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride, const uint8_t *mask, + int mask_stride, int invert_mask) { + int i = 0; + uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *src0 = invert_mask ? pred : ref; + const uint16_t *src1 = invert_mask ? ref : pred; + const int stride0 = invert_mask ? width : ref_stride; + const int stride1 = invert_mask ? ref_stride : width; + const __m128i zero = _mm_setzero_si128(); + + if (width == 8) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask); + const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 16) { + do { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0)); + const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1)); + const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)comp_pred, comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1); + + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } else if (width == 32) { + do { + for (int j = 0; j < 2; j++) { + const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16)); + const __m128i s2 = + _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16)); + const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16)); + const __m128i s3 = + _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16)); + + const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16)); + const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero); + const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero); + + const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16); + const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16); + + _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp); + _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1); + } + src0 += stride0; + src1 += stride1; + mask += mask_stride; + comp_pred += width; + i += 1; + } while (i < height); + } +} -- cgit v1.2.3